-
Notifications
You must be signed in to change notification settings - Fork 815
Description
I have implemented an MLP (Multi-Layer Perceptron) model using Arm Compute Library. The model receives input data through a pointer named mlp_input. However, the input data is not predetermined and needs to be computed dynamically by other modules in my program.
My current implementation follows this workflow:
Pass input pointer
Call do_setup()
Call do_run()
What I want to achieve is a more efficient pattern:
1. Call do_setup() once
2. Pass a different input pointer (as new data becomes available)
3. Call do_run() for the new input
4. Repeat steps 2-3 as needed
This approach would allow me to initialize the inference graph once and reuse it with different input data without rebuilding the graph each time, improving both code efficiency and performance. The specific code is as follows:
#include "arm_compute/graph.h"
#include "support/ToolchainSupport.h"
#include "utils/CommonGraphOptions.h"
#include "utils/GraphUtils.h"
#include "utils/Utils.h"

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/graph.h>
#include <arm_compute/runtime/Tensor.h>

#include <algorithm>
#include <chrono>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
// Global switch for the debug dumps below.
constexpr bool ENABLE_VERBOSE = true;

/** Dump a float buffer described by @p info to stdout, 16 values per row.
 *
 * @param ptr   Buffer of floats; must hold info.tensor_shape().total_size() elements.
 * @param info  Tensor metadata supplying the element count.
 * @param label Heading printed before the dump.
 *
 * No-op when ENABLE_VERBOSE is false.
 */
void print_tensor(const float *ptr, const arm_compute::TensorInfo &info, const std::string &label)
{
    if (!ENABLE_VERBOSE)
    {
        return;
    }
    const size_t total_size = info.tensor_shape().total_size();
    std::cout << "\n" << label << std::endl;
    std::cout << "Total Size: " << total_size << std::endl;
    std::cout << "Elements:" << std::endl;
    for (size_t idx = 0; idx < total_size; ++idx)
    {
        const bool row_start = (idx % 16 == 0);
        if (row_start)
        {
            // Break the line before every row except the first.
            if (idx != 0)
            {
                std::cout << std::endl;
            }
            std::cout << " [" << idx << "] ";
        }
        std::cout << ptr[idx] << " ";
    }
    std::cout << std::endl;
}
class DirectAccessor final : public arm_compute::graph::ITensorAccessor
{
private:
float *_external_data;
size_t _data_size;
bool _first_call = true;
public:
DirectAccessor(float *external_data, size_t size)
: _external_data(external_data), _data_size(size) {}
bool access_tensor(arm_compute::ITensor &tensor) override
{
if (!_first_call) return false;
_first_call = false;
auto &dst_tensor = dynamic_cast<arm_compute::Tensor&>(tensor);
size_t tensor_elements = dst_tensor.info()->tensor_shape().total_size();
ARM_COMPUTE_ERROR_ON(tensor_elements != _data_size);
size_t dst_offset_bytes = dst_tensor.info()->offset_first_element_in_bytes();
size_t total_bytes = tensor_elements * sizeof(float);
float *dst_ptr = reinterpret_cast<float*>(dst_tensor.buffer() + dst_offset_bytes);
std::memcpy(dst_ptr, _external_data, total_bytes);
return true;
}
};
/** Output accessor: dumps the output tensor to stdout when ENABLE_VERBOSE
 *  is set, then lets the graph continue. */
class OutputAccessor final : public arm_compute::graph::ITensorAccessor
{
public:
    bool access_tensor(arm_compute::ITensor &tensor) override
    {
        if (ENABLE_VERBOSE)
        {
            auto &tensor_ref = dynamic_cast<arm_compute::Tensor &>(tensor);
            const size_t num_elements = tensor_ref.info()->tensor_shape().total_size();
            const size_t offset_bytes = tensor_ref.info()->offset_first_element_in_bytes();
            // BUG FIX: the original wrote `float data = reinterpret_cast<float>(...)`,
            // which casts a pointer to a scalar and does not compile (nor does the
            // subsequent data[i]); we need a pointer to the first element.
            const float *data = reinterpret_cast<const float *>(tensor_ref.buffer() + offset_bytes);
            for (size_t i = 0; i < num_elements; ++i)
            {
                std::cout << data[i] << " ";
            }
            std::cout << std::endl;
        }
        return true;
    }
};
class GraphExample
{
private:
arm_compute::graph::frontend::Stream graph;
static constexpr size_t NUM = 128;
std::string DATA_PATH = "/home/firefly/ABC/ACL_TEST/fine_matcher_weights/";
public:
float *external_input;
GraphExample() : graph(0, "DataPassingGraph"){}
void do_setup()
{
arm_compute::graph::TensorDescriptor input_desc(arm_compute::TensorShape(128), arm_compute::DataType::F32);
graph << arm_compute::graph::Target::NEON
<< arm_compute::graph::frontend::InputLayer(input_desc, std::make_unique<DirectAccessor>(external_input, 128))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear1_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear1_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn1_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn1_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear2_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear2_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn2_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn2_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear3_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear3_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn3_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn3_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(512,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear4_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear4_bias.npy"))
<< arm_compute::graph::frontend::ReshapeLayer(arm_compute::TensorShape(1, 1, 512))
<< arm_compute::graph::frontend::ScaleLayer(
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn4_scale.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "bn4_offset.npy"))
<< arm_compute::graph::frontend::ActivationLayer(arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU))
<< arm_compute::graph::frontend::FullyConnectedLayer(64,
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear5_weights.npy"),
arm_compute::graph_utils::get_weights_accessor(DATA_PATH, "linear5_bias.npy"))
<< arm_compute::graph::frontend::OutputLayer(std::make_unique<OutputAccessor>());
arm_compute::graph::GraphConfig config;
config.num_threads = 1;
config.use_tuner = false;
graph.finalize(arm_compute::graph::Target::NEON, config);
}
void do_run(int num_runs = 100)
{
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_runs; ++i) {
graph.run();
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
double avg_time_ms = duration.count() / 1000.0 / num_runs;
}
};
int main()
{
    // 128 elements to match the graph's input descriptor (TensorShape(128)).
    // NOTE: the original also filled an array `a` with 1.0f that was never
    // read anywhere; it has been removed.
    float mlp_input[128];
    std::fill(std::begin(mlp_input), std::end(mlp_input), 0.64f);

    GraphExample example;
    example.external_input = mlp_input; // must be set BEFORE do_setup()
    example.do_setup();
    example.do_run(100);
    return 0;
}