Hello everyone
Sorry for asking. I tried to build my own C++ program which is a converted program from python , to run a simple model to detect people for my board imx8m plus.
The Code:
1. main.cpp
// main.cpp
#include "detector.h"
#include <opencv2/opencv.hpp>
int main(int argc, char* argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <image_path>" << std::endl;
return 1;
}
std::string image_path = argv[1];
std::string model_path = "model.tflite";
std::string delegate_path = "/usr/lib/libvx_delegate.so";
cv::Size input_size(192, 192);
float score_th = 0.5;
float nms_th = 0.4;
Detector detector(model_path, delegate_path, input_size, score_th, nms_th);
if (!detector.init_model()) {
return 1;
}
cv::Mat image = cv::imread(image_path);
if (image.empty()) {
std::cerr << "Failed to load image from " << image_path << std::endl;
return 1;
}
auto [bboxes, scores] = detector.detect(image);
for (size_t i = 0; i < bboxes.size(); ++i) {
cv::rectangle(image, bboxes[i], cv::Scalar(0, 255, 0), 2);
std::cout << "Detected bbox: " << bboxes[i] << " with score: " << scores[i] << std::endl;
}
// cv::imshow("Detections", image);
cv::waitKey(0);
return 0;
}
2. detector.h
// detector.h
#ifndef DETECTOR_H
#define DETECTOR_H
#include <opencv2/opencv.hpp>
#include <tensorflow/lite/interpreter.h>
#include <tensorflow/lite/kernels/register.h>
#include <tensorflow/lite/model.h>
#include <tensorflow/lite/optional_debug_tools.h>
#include <tensorflow/lite/delegates/external/external_delegate.h>
#include <tensorflow-lite-vx-delegate/vsi_npu_custom_op.h>
#include "delegate_main.h"
class Detector {
public:
Detector(const std::string& model_path,
const std::string& delegate_path,
const cv::Size& input_shape,
float score_th,
float nms_th);
bool init_model();
std::pair<std::vector<cv::Rect>, std::vector<float>> detect(const cv::Mat& image);
private:
std::string model_path_;
std::string delegate_path_;
cv::Size input_shape_;
float score_th_;
float nms_th_;
std::unique_ptr<tflite::Interpreter> interpreter_;
std::pair<cv::Mat, float> preprocess(const cv::Mat& image, const cv::Size& input_size);
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess(cv::Mat& outputs,
const cv::Size& img_size,
float ratio,
float score_th,
float nms_th);
void meshgrid(const cv::Range& x_range, const cv::Range& y_range, cv::Mat& xv, cv::Mat& yv);
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> nms(const std::vector<cv::Rect>& bboxes,
const std::vector<float>& scores,
float score_th,
float nms_th);
};
#endif // DETECTOR_H
3. detector.cpp
// detector.cpp
#include "detector.h"
#include <iostream>
Detector::Detector(const std::string& model_path,
const std::string& delegate_path,
const cv::Size& input_shape,
float score_th,
float nms_th)
: model_path_(model_path),
delegate_path_(delegate_path),
input_shape_(input_shape),
score_th_(score_th),
nms_th_(nms_th) {}
bool Detector::init_model() {
auto model = tflite::FlatBufferModel::BuildFromFile(model_path_.c_str());
if (!model) {
std::cerr << "Failed to load model from " << model_path_ << std::endl;
return false;
}
auto ext_delegate_option = TfLiteExternalDelegateOptionsDefault(delegate_path_.c_str());
auto ext_delegate_ptr = TfLiteExternalDelegateCreate(&ext_delegate_option);
if (!ext_delegate_ptr) {
std::cerr << "Failed to create external delegate" << std::endl;
return false;
}
tflite::ops::builtin::BuiltinOpResolver resolver;
resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());
tflite::InterpreterBuilder builder(*model, resolver);
builder(&interpreter_);
if (!interpreter_) {
std::cerr << "Failed to build interpreter" << std::endl;
return false;
}
interpreter_->ModifyGraphWithDelegate(ext_delegate_ptr);
if (interpreter_->AllocateTensors() != kTfLiteOk) {
std::cerr << "Failed to allocate tensors" << std::endl;
return false;
}
return true;
}
std::pair<cv::Mat, float> Detector::preprocess(const cv::Mat& image, const cv::Size& input_size) {
float ratio = std::min(static_cast<float>(input_size.width) / image.cols,
static_cast<float>(input_size.height) / image.rows);
cv::Size new_size(static_cast<int>(image.cols * ratio), static_cast<int>(image.rows * ratio));
cv::Mat resized_image;
cv::resize(image, resized_image, new_size, 0, 0, cv::INTER_LINEAR);
cv::Mat padded_image = cv::Mat::ones(input_size, CV_8UC3) * 114;
resized_image.copyTo(padded_image(cv::Rect(0, 0, resized_image.cols, resized_image.rows)));
std::vector<cv::Mat> channels(3);
cv::split(padded_image, channels);
cv::Mat chw_image(3, input_size.height * input_size.width, CV_32F);
for(int i = 0; i < 3; ++i) {
channels[i].convertTo(channels[i], CV_32F);
std::memcpy(chw_image.ptr<float>(i), channels[i].data, channels[i].total() * sizeof(float));
}
cv::Mat reshaped_image = chw_image.reshape(1, {1, 3, input_size.height, input_size.width});
return std::make_pair(reshaped_image, ratio);
}
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> Detector::postprocess(cv::Mat& outputs,
const cv::Size& img_size,
float ratio,
float score_th,
float nms_th) {
std::vector<cv::Rect> bboxes;
std::vector<float> scores;
std::vector<int> class_ids;
std::vector<int> strides = {8, 16, 32};
std::vector<cv::Mat> grids;
std::vector<cv::Mat> expanded_strides;
for (int stride : strides) {
int hsize = img_size.height / stride;
int wsize = img_size.width / stride;
cv::Mat xv, yv;
meshgrid(cv::Range(0, wsize - 1), cv::Range(0, hsize - 1), xv, yv);
cv::Mat grid;
cv::hconcat(xv.reshape(1, 1), yv.reshape(1, 1), grid);
grids.push_back(grid.reshape(2, 1));
expanded_strides.push_back(cv::Mat(grid.size(), CV_32F, cv::Scalar(stride)));
}
cv::Mat grid_cat, stride_cat;
cv::vconcat(grids, grid_cat);
cv::vconcat(expanded_strides, stride_cat);
outputs.colRange(2, 4).convertTo(outputs.colRange(2, 4), CV_32F);
cv::Mat exp_colRange(outputs.colRange(2, 4).size(), CV_32F);
cv::exp(outputs.colRange(2, 4), exp_colRange);
outputs.colRange(0, 2) = (outputs.colRange(0, 2) + grid_cat) * stride_cat;
outputs.colRange(2, 4) = exp_colRange.mul(stride_cat);
cv::Mat predictions = outputs.row(0);
cv::Mat bboxes_mat = predictions.colRange(0, 4);
cv::Mat scores_mat = predictions.col(4).mul(predictions.colRange(5, predictions.cols));
scores.assign(scores_mat.begin<float>(), scores_mat.end<float>());
std::vector<cv::Rect> bboxes_xyxy(bboxes_mat.rows);
for (int i = 0; i < bboxes_mat.rows; ++i) {
float x_center = bboxes_mat.at<float>(i, 0);
float y_center = bboxes_mat.at<float>(i, 1);
float width = bboxes_mat.at<float>(i, 2);
float height = bboxes_mat.at<float>(i, 3);
float x_min = x_center - width / 2.0;
float y_min = y_center - height / 2.0;
float x_max = x_center + width / 2.0;
float y_max = y_center + height / 2.0;
bboxes_xyxy[i] = cv::Rect(cv::Point(x_min / ratio, y_min / ratio), cv::Point(x_max / ratio, y_max / ratio));
}
return nms(bboxes_xyxy, scores, score_th, nms_th);
}
void Detector::meshgrid(const cv::Range& x_range, const cv::Range& y_range, cv::Mat& xv, cv::Mat& yv) {
cv::Mat x_coords = cv::Mat(x_range.size(), 1, CV_32F);
cv::Mat y_coords = cv::Mat(y_range.size(), 1, CV_32F);
for (int i = 0; i < x_range.size(); ++i) {
x_coords.at<float>(i,0) = x_range.start + i;
}
for (int i = 0; i < y_range.size(); ++i) {
y_coords.at<float>(i,0) = y_range.start + i;
}
cv::repeat(x_coords, 1, y_range.size(), xv);
cv::repeat(y_coords.t(), x_range.size(), 1, yv);
}
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> Detector::nms(const std::vector<cv::Rect>& bboxes,
const std::vector<float>& scores,
float score_th,
float nms_th) {
std::vector<cv::Rect> bboxes_filtered;
std::vector<float> scores_filtered;
std::vector<int> class_ids_filtered;
std::vector<int> indices;
cv::dnn::NMSBoxes(bboxes, scores, score_th, nms_th, indices);
for(int idx : indices) {
bboxes_filtered.push_back(bboxes[idx]);
scores_filtered.push_back(scores[idx]);
class_ids_filtered.push_back(0);
}
return std::make_tuple(bboxes_filtered, scores_filtered, class_ids_filtered);
}
std::pair<std::vector<cv::Rect>, std::vector<float>> Detector::detect(const cv::Mat& image) {
cv::Mat temp_image = image.clone();
auto [preprocessed_image, ratio] = preprocess(temp_image, input_shape_);
std::cout << "Preprocess Completed"<<std::endl;
// Setting input tensor
TfLiteTensor* input_data = interpreter_->tensor(interpreter_->inputs()[0]);
const uint input_width = input_data->dims->data[3];
const uint input_height = input_data->dims->data[2];
const uint input_channels = input_data->dims->data[1];
const uint batch_size = input_data->dims->data[0];
std::cout << "Expected dimension: "<< batch_size << "x" << input_channels << "x" << input_height << "x" << input_width << std::endl;
const uint image_width = preprocessed_image.size[3];
const uint image_height = preprocessed_image.size[2];
const uint image_channels = preprocessed_image.size[1];
const uint image_batch_size = preprocessed_image.size[0];
std::cout << "Image dimension: "<< image_batch_size << "x" << image_channels << "x" << image_height << "x" << image_width << std::endl;
if(input_data->type !=kTfLiteFloat32){
std::cerr << "input tensor is not of type float" << std::endl;
return std::make_pair(std::vector<cv::Rect>(), std::vector<float>());
}
if(input_data->data.f == nullptr) {
std::cerr << "input tensor data pointer is null" << std::endl;
return std::make_pair(std::vector<cv::Rect>(), std::vector<float>());
}
std::memcpy(input_data->data.f, preprocessed_image.ptr<float>(0), batch_size * input_width * input_height * input_channels * sizeof(float));
if(memcmp(input_data->data.f, preprocessed_image.ptr<float>(0),batch_size * input_width * input_height * input_channels * sizeof(float)) != 0){
std::cerr << "data copy to input tensor failed" << std::endl;
return std::make_pair(std::vector<cv::Rect>(), std::vector<float>());
}
else{
std::cout << "Set up Input Tensor Completed"<<std::endl;
}
// Running inference
interpreter_->Invoke();
std::cout << "Inference Completed"<<std::endl;
// Getting output tensor
float* output_tensor = interpreter_->typed_output_tensor<float>(0);
size_t output_size = interpreter_->tensor(interpreter_->outputs()[0])->bytes / sizeof(float);
cv::Mat results(1, output_size, CV_32F, output_tensor);
std::cout << "Get Results Completed"<<std::endl;
// Postprocessing
auto [bboxes_xyxy, scores, class_ids] = postprocess(results, input_shape_, ratio, score_th_, nms_th_);
// Converting the bboxes to cv::Rect and packing results
std::vector<cv::Rect> result_rect_list;
for (size_t i = 0; i < bboxes_xyxy.size(); ++i) {
result_rect_list.push_back(bboxes_xyxy[i]);
}
// Returning the list of rectangles and the associated scores
return {result_rect_list, scores};
}
my board image is nanbield 6.6.3_1.0.0 full image
I tried to run it using VX Delegate and NPU and encounter a problem when running the code
root@imx8mpevk:/run/media/SD CARD-sda1/test_npu# ./detector_app lena_color_512.tif
INFO: Vx delegate: allowed_cache_mode set to 0.
INFO: Vx delegate: device num set to 0.
INFO: Vx delegate: allowed_builtin_code set to 0.
INFO: Vx delegate: error_during_init set to 0.
INFO: Vx delegate: error_during_prepare set to 0.
INFO: Vx delegate: error_during_invoke set to 0.
Preprocess Completed
Expected dimension: 1x3x192x192
Image dimension: 1x3x192x192
Set up Input Tensor Completed
E [/usr/src/debug/tim-vx/1.1.88-r[ 126.612163] audit: type=1701 audit(1695250801.923:18): auid=4294967295 uid=0 gid=0 ses=4294967295 pid=1270 comm="detector_app" exe=2F72756E2F6D656469612F534420434152442D736461312F746573745F6E70752F6465746563746F725F617070 sig=6 res=1
0/src/tim/transform/ops/op_layout_inference.cc:MapAxis:177]Map axis failed.
detector_app: /usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:178: uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t): Assertion `false' failed.
Aborted (core dumped)
I also tried to get the gdb debug running, and it return something like this:
(gdb) set args lena_color_512.tif
(gdb) run
Starting program: /run/media/SD CARD-sda1/test_npu/detector_app lena_color_512.tif
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
INFO: Vx delegate: allowed_cache_mode set to 0.
INFO: Vx delegate: device num set to 0.
INFO: Vx delegate: allowed_builtin_code set to 0.
INFO: Vx delegate: error_during_init set to 0.
INFO: Vx delegate: error_during_prepare set to 0.
INFO: Vx delegate: error_during_invoke set to 0.
Preprocess Completed
Expected dimension: 1x3x192x192
Image dimension: 1x3x192x192
Set up Input Tensor Completed
[New Thread 0xfffff146cf00 (LWP 1660)]
E [/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:MapAxis:177]Map axis failed.
detector_app: /usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:178: uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t): Assertion `false' failed.
Thread 1 "detector_app" received signal SIGABRT, Aborted.
__pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
44 pthread_kill.c: No such file or directory.
(gdb) bt
#0 __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
#1 0x0000fffff69c0568 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
#2 0x0000fffff697acd0 in __GI_raise (sig=sig@entry=6) at /usr/src/debug/glibc/2.38+git-r0/sysdeps/posix/raise.c:26
#3 0x0000fffff6966ef0 in __GI_abort () at abort.c:79
#4 0x0000fffff69743f8 in __assert_fail_base (fmt=0xfffff6a8a8e8 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0xfffff1ffdcf0 "false",
file=file@entry=0xfffff1fff568 "/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc", line=line@entry=178,
function=function@entry=0xfffff1fff5d8 "uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t)") at assert.c:92
#5 0x0000fffff6974470 in __assert_fail (assertion=0xfffff1ffdcf0 "false", file=0xfffff1fff568 "/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc", line=178,
function=0xfffff1fff5d8 "uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t)") at assert.c:101
#6 0x0000fffff1fa5f74 in tim::transform::OpLayoutInfer::MapAxis(std::vector<unsigned int, std::allocator<unsigned int> > const&, unsigned int) () from /usr/lib/libtim-vx.so
#7 0x0000fffff1f6a1b0 in ?? () from /usr/lib/libtim-vx.so
#8 0x0000fffff1f4e5f4 in tim::transform::layout_inference_impl::HandleLayoutInfer(std::shared_ptr<tim::transform::layout_inference_impl::LayoutInferContext>&, std::shared_ptr<tim::vx::Operation> const&) () from /usr/lib/libtim-vx.so
#9 0x0000fffff1f531f4 in tim::transform::LayoutInference(std::shared_ptr<tim::vx::Graph> const&, std::shared_ptr<tim::vx::Context>&, std::map<std::shared_ptr<tim::vx::Tensor>, std::shared_ptr<tim::transform::IPermuteVector>, std::less<std::shared_ptr<tim::vx::Tensor> >, std::allocator<std::pair<std::shared_ptr<tim::vx::Tensor> const, std::shared_ptr<tim::transform::IPermuteVector> > > >) () from /usr/lib/libtim-vx.so
#10 0x0000fffff23d85ac in vx::delegate::Delegate::Invoke(vx::delegate::OpData const&, TfLiteContext*, TfLiteNode*) () from /usr/lib/libvx_delegate.so
#11 0x0000fffff7be9d9c in tflite::Subgraph::InvokeImpl() () from /usr/lib/libtensorflow-lite.so.2.14.0
#12 0x0000fffff7bea388 in tflite::Subgraph::Invoke() () from /usr/lib/libtensorflow-lite.so.2.14.0
#13 0x0000fffff7bd440c in tflite::impl::Interpreter::Invoke() () from /usr/lib/libtensorflow-lite.so.2.14.0
#14 0x0000aaaaaaaa62e0 in Detector::detect (this=this@entry=0xfffffffff890, image=...)
at /home/ubuntu/imx-yocto-bsp/sdk/sysroots/armv8a-poky-linux/usr/include/c++/13.2.0/bits/unique_ptr.h:199
#15 0x0000aaaaaaaa35b0 in main (argc=<optimized out>, argv=<optimized out>) at /home/ubuntu/imx-yocto-bsp/tflite_test/build_minim/main.cpp:29
Does anyone have a clue what is wrong? because I am not sure what happened here. but what i only know that the assertion at op_layout_inference.cc:MapAxis:177 Map axis failed because of assertion error (?)
Thank you in advance
已解决! 转到解答。
I found the cause of the problem. Apparently this line caused the error:
resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());
So for now, i just disable it and magically it works. Maybe someone can explain why it trigger the error, but for now I can finally continue with my app development.
For the model, I check it using Python code, and apparently no error, so the model itself is compatible with the NPU run.
Thank you
Hello,
It looks tile the assertion is there to say that as far as you aware, has made it impossible to call the zero-args constructor according is private and so if a call occurs, that assertion has been violated per your error.
Regards
I found the cause of the problem. Apparently this line caused the error:
resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());
So for now, i just disable it and magically it works. Maybe someone can explain why it trigger the error, but for now I can finally continue with my app development.
For the model, I check it using Python code, and apparently no error, so the model itself is compatible with the NPU run.
Thank you