TfLite NPU run error op_layout_inference.cc:MapAxis:177 Map axis failed

astel_ · ‎09-04-2024

Hello everyone

Sorry for asking. I tried to build my own C++ program, converted from a Python program, to run a simple people-detection model on my i.MX 8M Plus board.

The Code:

1. main.cpp

// main.cpp
#include "detector.h"
#include <opencv2/opencv.hpp>

int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <image_path>" << std::endl;
        return 1;
    }

    std::string image_path = argv[1];
    std::string model_path = "model.tflite";
    std::string delegate_path = "/usr/lib/libvx_delegate.so";
    cv::Size input_size(192, 192);
    float score_th = 0.5;
    float nms_th = 0.4;

    Detector detector(model_path, delegate_path, input_size, score_th, nms_th);
    if (!detector.init_model()) {
        return 1;
    }

    cv::Mat image = cv::imread(image_path);
    if (image.empty()) {
        std::cerr << "Failed to load image from " << image_path << std::endl;
        return 1;
    }

    auto [bboxes, scores] = detector.detect(image);

    for (size_t i = 0; i < bboxes.size(); ++i) {
        cv::rectangle(image, bboxes[i], cv::Scalar(0, 255, 0), 2);
        std::cout << "Detected bbox: " << bboxes[i] << " with score: " << scores[i] << std::endl;
    }

    // cv::imshow("Detections", image);
    cv::waitKey(0);

    return 0;
}

2. detector.h

// detector.h
#ifndef DETECTOR_H
#define DETECTOR_H

#include <opencv2/opencv.hpp>
#include <tensorflow/lite/interpreter.h>
#include <tensorflow/lite/kernels/register.h>
#include <tensorflow/lite/model.h>
#include <tensorflow/lite/optional_debug_tools.h>
#include <tensorflow/lite/delegates/external/external_delegate.h>
#include <tensorflow-lite-vx-delegate/vsi_npu_custom_op.h>
#include "delegate_main.h"

// Object detector built on a TensorFlow Lite interpreter.
//
// Usage, as shown by main(): construct with the configuration, call
// init_model() once, then call detect() on an image.
class Detector {
public:
    // Stores the configuration; the model itself is loaded by init_model().
    //   model_path    - path of the .tflite model file
    //   delegate_path - path of the external delegate shared library
    //   input_shape   - size passed to preprocess() and postprocess()
    //   score_th      - score threshold forwarded to nms()
    //   nms_th        - NMS threshold forwarded to nms()
    Detector(const std::string& model_path, 
             const std::string& delegate_path,
             const cv::Size& input_shape,
             float score_th,
             float nms_th);

    // Loads the model file, creates the external delegate, builds the
    // interpreter and allocates its tensors. Returns false (after writing a
    // message to std::cerr) when one of these steps fails.
    bool init_model();

    // Preprocesses `image`, runs the interpreter and post-processes the first
    // output tensor. Returns the kept boxes and their scores; an empty pair is
    // returned when the input tensor cannot be filled.
    std::pair<std::vector<cv::Rect>, std::vector<float>> detect(const cv::Mat& image);

private:
    std::string model_path_;      // passed to tflite::FlatBufferModel::BuildFromFile
    std::string delegate_path_;   // passed to TfLiteExternalDelegateOptionsDefault
    cv::Size input_shape_;        // network input size used by detect()
    float score_th_;              // score threshold used by detect()
    float nms_th_;                // NMS threshold used by detect()
    std::unique_ptr<tflite::Interpreter> interpreter_;  // built by init_model()

    // Resizes `image` to fit `input_size` with the aspect ratio preserved, pads
    // it to `input_size` and repacks it as a 1x3xHxW float tensor. Returns the
    // tensor together with the resize ratio that was applied.
    std::pair<cv::Mat, float> preprocess(const cv::Mat& image, const cv::Size& input_size);

    // Decodes the raw network output (strides 8, 16 and 32), divides the boxes
    // by `ratio` and passes them through nms(). Returns boxes, scores and class
    // ids of the kept detections.
    std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess(cv::Mat& outputs, 
                                                                                        const cv::Size& img_size, 
                                                                                        float ratio, 
                                                                                        float score_th, 
                                                                                        float nms_th);

    // Fills `xv` and `yv` with coordinate grids built from the two ranges.
    void meshgrid(const cv::Range& x_range, const cv::Range& y_range, cv::Mat& xv, cv::Mat& yv);

    // Runs cv::dnn::NMSBoxes on the candidates and returns the kept boxes and
    // scores; the class id of every kept detection is reported as 0.
    std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> nms(const std::vector<cv::Rect>& bboxes, 
                                                                                const std::vector<float>& scores, 
                                                                                float score_th, 
                                                                                float nms_th);
};

#endif // DETECTOR_H

3. detector.cpp

// detector.cpp
#include "detector.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include <memory>
#include <utility>

Detector::Detector(const std::string& model_path, 
                   const std::string& delegate_path,
                   const cv::Size& input_shape,
                   float score_th,
                   float nms_th)
    : model_path_(model_path),
      delegate_path_(delegate_path),
      input_shape_(input_shape),
      score_th_(score_th),
      nms_th_(nms_th) {}

bool Detector::init_model() {
    auto model = tflite::FlatBufferModel::BuildFromFile(model_path_.c_str());
    if (!model) {
        std::cerr << "Failed to load model from " << model_path_ << std::endl;
        return false;
    }

    auto ext_delegate_option = TfLiteExternalDelegateOptionsDefault(delegate_path_.c_str());
    auto ext_delegate_ptr = TfLiteExternalDelegateCreate(&ext_delegate_option);
    if (!ext_delegate_ptr) {
        std::cerr << "Failed to create external delegate" << std::endl;
        return false;
    }

    tflite::ops::builtin::BuiltinOpResolver resolver;
    resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());

    tflite::InterpreterBuilder builder(*model, resolver);
    
    builder(&interpreter_);
    if (!interpreter_) {
        std::cerr << "Failed to build interpreter" << std::endl;
        return false;
    }

    interpreter_->ModifyGraphWithDelegate(ext_delegate_ptr);
    if (interpreter_->AllocateTensors() != kTfLiteOk) {
        std::cerr << "Failed to allocate tensors" << std::endl;
        return false;
    }

    return true;
}

// Letterboxes `image` into an `input_size` canvas and repacks it as a planar
// float tensor.
//
// The image is scaled by the largest factor that still fits inside
// `input_size`, copied into the top-left corner of a canvas filled with the
// value 114, and converted from interleaved 8-bit pixels to a 1x3xHxW CV_32F
// Mat. Pixel values are not rescaled and the channel order of `image` is kept.
//
// @param image       non-empty 8-bit, 3-channel image (CV_Assert otherwise)
// @param input_size  network input size (width x height)
// @return the 4-D tensor and the scale factor that was applied to `image`
std::pair<cv::Mat, float> Detector::preprocess(const cv::Mat& image, const cv::Size& input_size) {
    // A differently typed image would make copyTo() below reallocate its
    // destination and leave the canvas untouched, so reject it explicitly.
    CV_Assert(!image.empty() && image.type() == CV_8UC3);

    const float ratio = std::min(static_cast<float>(input_size.width) / image.cols,
                                 static_cast<float>(input_size.height) / image.rows);

    // Truncation can yield 0 for extreme aspect ratios; cv::resize rejects an
    // empty target size, so keep at least one pixel per side.
    const cv::Size new_size(std::max(1, static_cast<int>(image.cols * ratio)),
                            std::max(1, static_cast<int>(image.rows * ratio)));
    cv::Mat resized_image;
    cv::resize(image, resized_image, new_size, 0, 0, cv::INTER_LINEAR);

    // cv::Mat::ones() sets only the first channel of a multi-channel Mat, so
    // the canvas is filled through an explicit scalar instead.
    cv::Mat padded_image(input_size, CV_8UC3, cv::Scalar::all(114));
    resized_image.copyTo(padded_image(cv::Rect(0, 0, resized_image.cols, resized_image.rows)));

    std::vector<cv::Mat> channels;
    cv::split(padded_image, channels);

    // One row per channel; each row is later viewed as an HxW plane.
    cv::Mat chw_image(3, input_size.height * input_size.width, CV_32F);
    for (int c = 0; c < 3; ++c) {
        // Header over row c with the shape of one image plane. Because size
        // and type already match, convertTo() writes into chw_image directly.
        cv::Mat plane(input_size.height, input_size.width, CV_32F, chw_image.ptr<float>(c));
        channels[c].convertTo(plane, CV_32F);
    }

    const std::vector<int> nchw_shape = {1, 3, input_size.height, input_size.width};
    cv::Mat reshaped_image = chw_image.reshape(1, nchw_shape);
    return std::make_pair(reshaped_image, ratio);
}

// Decodes the raw network output into boxes and scores and applies NMS.
//
// `outputs` is read as a flat CV_32F buffer holding one record per anchor:
// [x offset, y offset, log width, log height, objectness, class scores...].
// Anchors come from three grids with strides 8, 16 and 32 over `img_size`.
// For each anchor:
//   centre = (offset + grid cell) * stride
//   size   = exp(log size) * stride
//   score  = objectness * best class score
// Boxes are divided by `ratio` to map them back to the original image.
//
// @param outputs   raw output tensor; it is only read, never modified
// @param img_size  network input size (width x height)
// @param ratio     scale factor returned by preprocess(); must be > 0
// @param score_th  score threshold, forwarded to nms()
// @param nms_th    NMS threshold, forwarded to nms()
// @return kept boxes, scores and class ids as produced by nms(); empty
//         vectors (after a message on std::cerr) when the tensor does not
//         have the expected layout
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> Detector::postprocess(cv::Mat& outputs, 
                                                                                            const cv::Size& img_size, 
                                                                                            float ratio, 
                                                                                            float score_th, 
                                                                                            float nms_th) {
    using Result = std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>>;

    const int strides[] = {8, 16, 32};

    int num_anchors = 0;
    for (int stride : strides) {
        num_anchors += (img_size.height / stride) * (img_size.width / stride);
    }

    if (outputs.empty() || outputs.type() != CV_32F || !outputs.isContinuous() ||
        num_anchors <= 0 || ratio <= 0.0f) {
        std::cerr << "postprocess: unexpected output tensor or scale factor" << std::endl;
        return Result{};
    }

    // The number of values per anchor is derived from the tensor size; at
    // least the four box values and the objectness must be present.
    const size_t total = outputs.total();
    const size_t anchor_count = static_cast<size_t>(num_anchors);
    if (total % anchor_count != 0 || total / anchor_count < 5) {
        std::cerr << "postprocess: output size " << total << " does not match "
                  << num_anchors << " anchors" << std::endl;
        return Result{};
    }
    const int num_cols = static_cast<int>(total / anchor_count);

    std::vector<cv::Rect> bboxes_xyxy;
    std::vector<float> scores;

    // NOTE(review): assumes anchors are stored stride by stride (8, 16, 32),
    // row-major inside each grid (x varies fastest) -- confirm against the
    // exported model.
    const float* record = outputs.ptr<float>(0);
    for (int stride : strides) {
        const int hsize = img_size.height / stride;
        const int wsize = img_size.width / stride;

        for (int gy = 0; gy < hsize; ++gy) {
            for (int gx = 0; gx < wsize; ++gx, record += num_cols) {
                float class_score = 1.0f;
                if (num_cols > 5) {
                    class_score = record[5];
                    for (int c = 6; c < num_cols; ++c) {
                        class_score = std::max(class_score, record[c]);
                    }
                }
                const float score = record[4] * class_score;

                // cv::dnn::NMSBoxes keeps only scores strictly above the
                // threshold, so dropping the others here cannot change the
                // result and avoids decoding boxes that are discarded anyway.
                if (!(score > score_th)) {
                    continue;
                }

                const float x_center = (record[0] + gx) * stride;
                const float y_center = (record[1] + gy) * stride;
                const float width = std::exp(record[2]) * stride;
                const float height = std::exp(record[3]) * stride;

                const float x_min = x_center - width / 2.0f;
                const float y_min = y_center - height / 2.0f;
                const float x_max = x_center + width / 2.0f;
                const float y_max = y_center + height / 2.0f;

                bboxes_xyxy.emplace_back(
                    cv::Point(static_cast<int>(x_min / ratio), static_cast<int>(y_min / ratio)),
                    cv::Point(static_cast<int>(x_max / ratio), static_cast<int>(y_max / ratio)));
                scores.push_back(score);
            }
        }
    }

    return nms(bboxes_xyxy, scores, score_th, nms_th);
}

// Builds two coordinate grids from the given ranges.
//
// With nx = x_range.size() and ny = y_range.size(), both outputs have nx rows
// and ny columns: xv(i, j) is the i-th value of x_range and yv(i, j) is the
// j-th value of y_range.
void Detector::meshgrid(const cv::Range& x_range, const cv::Range& y_range, cv::Mat& xv, cv::Mat& yv) {
    const int nx = x_range.size();
    const int ny = y_range.size();

    // Column vectors holding the consecutive integer values of each range.
    cv::Mat xs = cv::Mat(nx, 1, CV_32F);
    cv::Mat ys = cv::Mat(ny, 1, CV_32F);

    int row = 0;
    for (int x = x_range.start; row < nx; ++x, ++row) {
        xs.at<float>(row, 0) = x;
    }

    row = 0;
    for (int y = y_range.start; row < ny; ++y, ++row) {
        ys.at<float>(row, 0) = y;
    }

    // Tile the x column ny times side by side, and the transposed y column
    // (a single row) nx times downwards.
    cv::repeat(xs, 1, ny, xv);
    cv::repeat(ys.t(), nx, 1, yv);
}

// Applies cv::dnn::NMSBoxes to the candidate boxes and gathers the survivors.
//
// Returns the kept boxes, their scores and one class id per kept box; the
// class id is always 0.
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> Detector::nms(const std::vector<cv::Rect>& bboxes, 
                                                                                        const std::vector<float>& scores, 
                                                                                        float score_th, 
                                                                                        float nms_th) {
    // Positions (into bboxes / scores) of the boxes that survive suppression.
    std::vector<int> kept;
    cv::dnn::NMSBoxes(bboxes, scores, score_th, nms_th, kept);

    std::vector<cv::Rect> out_boxes;
    std::vector<float> out_scores;
    for (size_t k = 0; k < kept.size(); ++k) {
        const int src = kept[k];
        out_boxes.push_back(bboxes[src]);
        out_scores.push_back(scores[src]);
    }

    // One zero class id per kept detection.
    std::vector<int> out_class_ids(kept.size(), 0);

    return std::make_tuple(out_boxes, out_scores, out_class_ids);
}

// Runs the full pipeline on one image: preprocess, copy into the input
// tensor, invoke the interpreter, post-process the first output tensor.
//
// Progress messages are written to std::cout and problems to std::cerr.
//
// @param image  image to analyse; it is not modified
// @return the boxes kept by post-processing and their scores; both vectors
//         are empty when the interpreter is missing, a tensor does not have
//         the expected type or size, or inference fails
std::pair<std::vector<cv::Rect>, std::vector<float>> Detector::detect(const cv::Mat& image) {
    using Detections = std::pair<std::vector<cv::Rect>, std::vector<float>>;

    if (!interpreter_) {
        std::cerr << "detect() called without an initialised interpreter" << std::endl;
        return Detections{};
    }
    if (interpreter_->inputs().empty() || interpreter_->outputs().empty()) {
        std::cerr << "model has no input or output tensor" << std::endl;
        return Detections{};
    }

    // preprocess() takes the image by const reference, so no copy is needed.
    auto [preprocessed_image, ratio] = preprocess(image, input_shape_);

    std::cout << "Preprocess Completed"<<std::endl;

    // Setting input tensor
    TfLiteTensor* input_data = interpreter_->tensor(interpreter_->inputs()[0]);
    if (input_data == nullptr || input_data->dims == nullptr || input_data->dims->size != 4) {
        std::cerr << "input tensor is not 4-dimensional" << std::endl;
        return Detections{};
    }
    // The dimensions are read in NCHW order, matching the tensor built by
    // preprocess().
    const int batch_size = input_data->dims->data[0];
    const int input_channels = input_data->dims->data[1];
    const int input_height = input_data->dims->data[2];
    const int input_width = input_data->dims->data[3];

    std::cout << "Expected dimension: "<< batch_size << "x" << input_channels << "x" << input_height << "x" << input_width << std::endl;

    const int image_batch_size = preprocessed_image.size[0];
    const int image_channels = preprocessed_image.size[1];
    const int image_height = preprocessed_image.size[2];
    const int image_width = preprocessed_image.size[3];

    std::cout << "Image dimension: "<< image_batch_size << "x" << image_channels << "x" << image_height << "x" << image_width << std::endl;

    if(input_data->type !=kTfLiteFloat32){
        std::cerr << "input tensor is not of type float" << std::endl;
        return Detections{};
    }

    if(input_data->data.f == nullptr) {
        std::cerr << "input tensor data pointer is null" << std::endl;
        return Detections{};
    }

    // Copy only when source and destination hold the same number of bytes;
    // otherwise memcpy would read past the image or write past the tensor.
    const size_t image_bytes = preprocessed_image.total() * preprocessed_image.elemSize();
    if (image_bytes != input_data->bytes) {
        std::cerr << "preprocessed image size does not match input tensor size" << std::endl;
        return Detections{};
    }
    std::memcpy(input_data->data.f, preprocessed_image.ptr<float>(0), image_bytes);

    std::cout << "Set up Input Tensor Completed"<<std::endl;

    // Running inference
    if (interpreter_->Invoke() != kTfLiteOk) {
        std::cerr << "Inference failed" << std::endl;
        return Detections{};
    }

    std::cout << "Inference Completed"<<std::endl;

    // Getting output tensor; typed_output_tensor() yields nullptr when the
    // tensor does not hold floats.
    const TfLiteTensor* output_data = interpreter_->tensor(interpreter_->outputs()[0]);
    float* output_tensor = interpreter_->typed_output_tensor<float>(0);
    if (output_data == nullptr || output_tensor == nullptr) {
        std::cerr << "output tensor is not of type float" << std::endl;
        return Detections{};
    }
    const size_t output_size = output_data->bytes / sizeof(float);
    // Header over the tensor memory; no data is copied.
    cv::Mat results(1, static_cast<int>(output_size), CV_32F, output_tensor);

    std::cout << "Get Results Completed"<<std::endl;

    // Postprocessing
    auto [bboxes_xyxy, scores, class_ids] = postprocess(results, input_shape_, ratio, score_th_, nms_th_);

    // Returning the list of rectangles and the associated scores
    return {std::move(bboxes_xyxy), std::move(scores)};
}

my board image is nanbield 6.6.3_1.0.0 full image

I tried to run it on the NPU using the VX delegate and encountered a problem when running the code:

root@imx8mpevk:/run/media/SD CARD-sda1/test_npu# ./detector_app lena_color_512.tif
INFO: Vx delegate: allowed_cache_mode set to 0.
INFO: Vx delegate: device num set to 0.
INFO: Vx delegate: allowed_builtin_code set to 0.
INFO: Vx delegate: error_during_init set to 0.
INFO: Vx delegate: error_during_prepare set to 0.
INFO: Vx delegate: error_during_invoke set to 0.
Preprocess Completed
Expected dimension: 1x3x192x192
Image dimension: 1x3x192x192
Set up Input Tensor Completed
E [/usr/src/debug/tim-vx/1.1.88-r[  126.612163] audit: type=1701 audit(1695250801.923:18): auid=4294967295 uid=0 gid=0 ses=4294967295 pid=1270 comm="detector_app" exe=2F72756E2F6D656469612F534420434152442D736461312F746573745F6E70752F6465746563746F725F617070 sig=6 res=1
0/src/tim/transform/ops/op_layout_inference.cc:MapAxis:177]Map axis failed.
detector_app: /usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:178: uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t): Assertion `false' failed.
Aborted (core dumped)

I also ran it under gdb, and it returned something like this:

(gdb) set args lena_color_512.tif
(gdb) run
Starting program: /run/media/SD CARD-sda1/test_npu/detector_app lena_color_512.tif
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
INFO: Vx delegate: allowed_cache_mode set to 0.
INFO: Vx delegate: device num set to 0.
INFO: Vx delegate: allowed_builtin_code set to 0.
INFO: Vx delegate: error_during_init set to 0.
INFO: Vx delegate: error_during_prepare set to 0.
INFO: Vx delegate: error_during_invoke set to 0.
Preprocess Completed
Expected dimension: 1x3x192x192
Image dimension: 1x3x192x192
Set up Input Tensor Completed
[New Thread 0xfffff146cf00 (LWP 1660)]
E [/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:MapAxis:177]Map axis failed.
detector_app: /usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:178: uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t): Assertion `false' failed.

Thread 1 "detector_app" received signal SIGABRT, Aborted.
__pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
44      pthread_kill.c: No such file or directory.
(gdb) bt
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
#1  0x0000fffff69c0568 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
#2  0x0000fffff697acd0 in __GI_raise (sig=sig@entry=6) at /usr/src/debug/glibc/2.38+git-r0/sysdeps/posix/raise.c:26
#3  0x0000fffff6966ef0 in __GI_abort () at abort.c:79
#4  0x0000fffff69743f8 in __assert_fail_base (fmt=0xfffff6a8a8e8 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0xfffff1ffdcf0 "false",
    file=file@entry=0xfffff1fff568 "/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc", line=line@entry=178,
    function=function@entry=0xfffff1fff5d8 "uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t)") at assert.c:92
#5  0x0000fffff6974470 in __assert_fail (assertion=0xfffff1ffdcf0 "false", file=0xfffff1fff568 "/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc", line=178,
    function=0xfffff1fff5d8 "uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t)") at assert.c:101
#6  0x0000fffff1fa5f74 in tim::transform::OpLayoutInfer::MapAxis(std::vector<unsigned int, std::allocator<unsigned int> > const&, unsigned int) () from /usr/lib/libtim-vx.so
#7  0x0000fffff1f6a1b0 in ?? () from /usr/lib/libtim-vx.so
#8  0x0000fffff1f4e5f4 in tim::transform::layout_inference_impl::HandleLayoutInfer(std::shared_ptr<tim::transform::layout_inference_impl::LayoutInferContext>&, std::shared_ptr<tim::vx::Operation> const&) () from /usr/lib/libtim-vx.so
#9  0x0000fffff1f531f4 in tim::transform::LayoutInference(std::shared_ptr<tim::vx::Graph> const&, std::shared_ptr<tim::vx::Context>&, std::map<std::shared_ptr<tim::vx::Tensor>, std::shared_ptr<tim::transform::IPermuteVector>, std::less<std::shared_ptr<tim::vx::Tensor> >, std::allocator<std::pair<std::shared_ptr<tim::vx::Tensor> const, std::shared_ptr<tim::transform::IPermuteVector> > > >) () from /usr/lib/libtim-vx.so
#10 0x0000fffff23d85ac in vx::delegate::Delegate::Invoke(vx::delegate::OpData const&, TfLiteContext*, TfLiteNode*) () from /usr/lib/libvx_delegate.so
#11 0x0000fffff7be9d9c in tflite::Subgraph::InvokeImpl() () from /usr/lib/libtensorflow-lite.so.2.14.0
#12 0x0000fffff7bea388 in tflite::Subgraph::Invoke() () from /usr/lib/libtensorflow-lite.so.2.14.0
#13 0x0000fffff7bd440c in tflite::impl::Interpreter::Invoke() () from /usr/lib/libtensorflow-lite.so.2.14.0
#14 0x0000aaaaaaaa62e0 in Detector::detect (this=this@entry=0xfffffffff890, image=...)
    at /home/ubuntu/imx-yocto-bsp/sdk/sysroots/armv8a-poky-linux/usr/include/c++/13.2.0/bits/unique_ptr.h:199
#15 0x0000aaaaaaaa35b0 in main (argc=<optimized out>, argv=<optimized out>) at /home/ubuntu/imx-yocto-bsp/tflite_test/build_minim/main.cpp:29

Does anyone have a clue what is wrong? I am not sure what happened here; all I know is that the run aborts because the assertion at op_layout_inference.cc:MapAxis:177 ("Map axis failed") fails.

Thank you in advance

astel_ · ‎09-16-2024

I found the cause of the problem. Apparently this line caused the error:

resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());

So for now I have just disabled it, and magically it works. Maybe someone can explain why it triggers the error, but for now I can finally continue with my app development.

For the model, I check it using Python code, and apparently no error, so the model itself is compatible with the NPU run.

Thank you

View solution in original post

Bio_TICFSL · ‎09-04-2024

Hello,

It looks like the assertion is there to say that, as far as you are aware, it has been made impossible to call the zero-args constructor because it is private, and so if a call occurs, that assertion has been violated, per your error.

Regards

astel_ · ‎09-04-2024

is it possible that the model is not compatible with the NPU/VX delegate run?

Because when I try to run it on the CPU I get a different error (related to one of the StridedSlice layers, but I haven't checked the CPU run properly yet).

astel_ · ‎09-16-2024

I found the cause of the problem. Apparently this line caused the error:

resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());

So for now I have just disabled it, and magically it works. Maybe someone can explain why it triggers the error, but for now I can finally continue with my app development.

For the model, I check it using Python code, and apparently no error, so the model itself is compatible with the NPU run.

Thank you

TfLite NPU run error op_layout_inference.cc:MapAxis:177 Map axis failed

TfLite NPU run error op_layout_inference.cc:MapAxis:177 Map axis failed

i.MX 8 Family | i.MX 8QuadMax (8QM) | 8QuadPlus