-
Notifications
You must be signed in to change notification settings - Fork 815
Description
I have implemented an MLP (Multi-Layer Perceptron) model using Arm Compute Library. The model receives input data through a pointer named mlp_input. However, the input data is not predetermined and needs to be computed dynamically by other modules in my program.
My current implementation follows this workflow:
Pass input pointer
Call do_setup()
Call do_run()
What I want to achieve is a more efficient pattern:
1. Call do_setup() once
2. Pass a different input pointer (as new data becomes available)
3. Call do_run() for the new input
4. Repeat steps 2-3 as needed
This approach would allow me to initialize the inference graph once and reuse it with different input data without rebuilding the graph each time, improving both code efficiency and performance. The specific code is as follows:
#include "arm_compute/graph.h"
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
#include "utils/Utils.h"

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/graph.h>
#include <arm_compute/runtime/Tensor.h>

#include <algorithm>
#include <chrono>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
// Global switch for the debug dumps below.
constexpr bool ENABLE_VERBOSE = true;

/** Dump a float buffer described by @p info to stdout, 16 values per row.
 *
 * @param ptr   Buffer of floats; must hold info.tensor_shape().total_size() elements.
 * @param info  Tensor metadata supplying the element count.
 * @param label Heading printed before the dump.
 *
 * No-op when ENABLE_VERBOSE is false.
 */
void print_tensor(const float *ptr, const arm_compute::TensorInfo &info, const std::string &label)
{
    if (!ENABLE_VERBOSE)
    {
        return;
    }
    const size_t total_size = info.tensor_shape().total_size();
    std::cout << "\n" << label << std::endl;
    std::cout << "Total Size: " << total_size << std::endl;
    std::cout << "Elements:" << std::endl;
    for (size_t idx = 0; idx < total_size; ++idx)
    {
        const bool row_start = (idx % 16 == 0);
        if (row_start)
        {
            // Break the line before every row except the first.
            if (idx != 0)
            {
                std::cout << std::endl;
            }
            std::cout << " [" << idx << "] ";
        }
        std::cout << ptr[idx] << " ";
    }
    std::cout << std::endl;
}
class DirectAccessor final : public arm_compute::graph::ITensorAccessor
{
private:
float *_external_data;
size_t _data_size;
bool _first_call = true;
public:
DirectAccessor(float *external_data, size_t size)
: _external_data(external_data), _data_size(size) {}
bool access_tensor(arm_compute::ITensor &tensor) override
{
if (!_first_call) return false;
_first_call = false;
auto &dst_tensor = dynamic_cast<arm_compute::Tensor&>(tensor);
size_t tensor_elements = dst_tensor.info()->tensor_shape().total_size();
ARM_COMPUTE_ERROR_ON(tensor_elements != _data_size);
size_t dst_offset_bytes = dst_tensor.info()->offset_first_element_in_bytes();
size_t total_bytes = tensor_elements * sizeof(float);
float *dst_ptr = reinterpret_cast<float*>(dst_tensor.buffer() + dst_offset_bytes);
std::memcpy(dst_ptr, _external_data, total_bytes);
return true;
}
};
/** Output accessor: dumps the output tensor to stdout when ENABLE_VERBOSE
 *  is set, then lets the graph continue. */
class OutputAccessor final : public arm_compute::graph::ITensorAccessor
{
public:
    bool access_tensor(arm_compute::ITensor &tensor) override
    {
        if (ENABLE_VERBOSE)
        {
            auto &tensor_ref = dynamic_cast<arm_compute::Tensor &>(tensor);
            const size_t num_elements = tensor_ref.info()->tensor_shape().total_size();
            const size_t offset_bytes = tensor_ref.info()->offset_first_element_in_bytes();
            // BUG FIX: the original wrote `float data = reinterpret_cast<float>(...)`,
            // which casts a pointer to a scalar and does not compile (nor does the
            // subsequent data[i]); we need a pointer to the first element.
            const float *data = reinterpret_cast<const float *>(tensor_ref.buffer() + offset_bytes);
            for (size_t i = 0; i < num_elements; ++i)
            {
                std::cout << data[i] << " ";
            }
            std::cout << std::endl;
        }
        return true;
    }
};
class GraphExample
{
private:
arm_compute::graph::frontend::Stream graph;
static constexpr size_t NUM = 128;
std::string DATA_PATH = "/home/firefly/ABC/ACL_TEST/fine_matcher_weights/";
public:
float *external_input;
GraphExample() : graph(0, "DataPassingGraph"){}
void do_setup()
{
arm_compute::graph::TensorDescriptor input_desc(arm_compute::TensorShape(128), arm_compute::DataType::F32);
graph << arm_compute::graph::Target::NEON
<< arm_compute::graph::frontend::InputLayer(input_desc, std::make_unique<DirectAccessor>(external_input, 128))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear1_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear1_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn1_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn1_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear2_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear2_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn2_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn2_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear3_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear3_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn3_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn3_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear4_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear4_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn4_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn4_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(64,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear5_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear5_bias.npy"))
<< arm_compute::graph::frontend::OutputLayer(std::make_unique<OutputAccessor>());
arm_compute::graph::GraphConfig config;
config.num_threads = 1;
config.use_tuner = false;
graph.finalize(arm_compute::graph::Target::NEON, config);
}
void do_run(int num_runs = 100)
{
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_runs; ++i) {
graph.run();
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
double avg_time_ms = duration.count() / 1000.0 / num_runs;
}
};
int main()
{
    // 128 elements to match the graph's input descriptor (TensorShape(128)).
    // NOTE: the original also filled an array `a` with 1.0f that was never
    // read anywhere; it has been removed.
    float mlp_input[128];
    std::fill(std::begin(mlp_input), std::end(mlp_input), 0.64f);

    GraphExample example;
    example.external_input = mlp_input; // must be set BEFORE do_setup()
    example.do_setup();
    example.do_run(100);
    return 0;
}