Hello everyone
Sorry for asking. I tried to build my own C++ program which is a converted program from python , to run a simple model to detect people for my board imx8m plus.
The Code:
1. main.cpp
// main.cpp
#include "detector.h"
#include <opencv2/opencv.hpp>
int main(int argc, char* argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <image_path>" << std::endl;
return 1;
}
std::string image_path = argv[1];
std::string model_path = "model.tflite";
std::string delegate_path = "/usr/lib/libvx_delegate.so";
cv::Size input_size(192, 192);
float score_th = 0.5;
float nms_th = 0.4;
Detector detector(model_path, delegate_path, input_size, score_th, nms_th);
if (!detector.init_model()) {
return 1;
}
cv::Mat image = cv::imread(image_path);
if (image.empty()) {
std::cerr << "Failed to load image from " << image_path << std::endl;
return 1;
}
auto [bboxes, scores] = detector.detect(image);
for (size_t i = 0; i < bboxes.size(); ++i) {
cv::rectangle(image, bboxes[i], cv::Scalar(0, 255, 0), 2);
std::cout << "Detected bbox: " << bboxes[i] << " with score: " << scores[i] << std::endl;
}
// cv::imshow("Detections", image);
cv::waitKey(0);
return 0;
}
2. detector.h
// detector.h
#ifndef DETECTOR_H
#define DETECTOR_H
#include <opencv2/opencv.hpp>
#include <tensorflow/lite/interpreter.h>
#include <tensorflow/lite/kernels/register.h>
#include <tensorflow/lite/model.h>
#include <tensorflow/lite/optional_debug_tools.h>
#include <tensorflow/lite/delegates/external/external_delegate.h>
#include <tensorflow-lite-vx-delegate/vsi_npu_custom_op.h>
#include "delegate_main.h"
class Detector {
public:
Detector(const std::string& model_path,
const std::string& delegate_path,
const cv::Size& input_shape,
float score_th,
float nms_th);
bool init_model();
std::pair<std::vector<cv::Rect>, std::vector<float>> detect(const cv::Mat& image);
private:
std::string model_path_;
std::string delegate_path_;
cv::Size input_shape_;
float score_th_;
float nms_th_;
std::unique_ptr<tflite::Interpreter> interpreter_;
std::pair<cv::Mat, float> preprocess(const cv::Mat& image, const cv::Size& input_size);
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> postprocess(cv::Mat& outputs,
const cv::Size& img_size,
float ratio,
float score_th,
float nms_th);
void meshgrid(const cv::Range& x_range, const cv::Range& y_range, cv::Mat& xv, cv::Mat& yv);
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> nms(const std::vector<cv::Rect>& bboxes,
const std::vector<float>& scores,
float score_th,
float nms_th);
};
#endif // DETECTOR_H
3. detector.cpp
// detector.cpp
#include "detector.h"
#include <iostream>
Detector::Detector(const std::string& model_path,
const std::string& delegate_path,
const cv::Size& input_shape,
float score_th,
float nms_th)
: model_path_(model_path),
delegate_path_(delegate_path),
input_shape_(input_shape),
score_th_(score_th),
nms_th_(nms_th) {}
bool Detector::init_model() {
auto model = tflite::FlatBufferModel::BuildFromFile(model_path_.c_str());
if (!model) {
std::cerr << "Failed to load model from " << model_path_ << std::endl;
return false;
}
auto ext_delegate_option = TfLiteExternalDelegateOptionsDefault(delegate_path_.c_str());
auto ext_delegate_ptr = TfLiteExternalDelegateCreate(&ext_delegate_option);
if (!ext_delegate_ptr) {
std::cerr << "Failed to create external delegate" << std::endl;
return false;
}
tflite::ops::builtin::BuiltinOpResolver resolver;
resolver.AddCustom(kNbgCustomOp, tflite::ops::custom::Register_VSI_NPU_PRECOMPILED());
tflite::InterpreterBuilder builder(*model, resolver);
builder(&interpreter_);
if (!interpreter_) {
std::cerr << "Failed to build interpreter" << std::endl;
return false;
}
interpreter_->ModifyGraphWithDelegate(ext_delegate_ptr);
if (interpreter_->AllocateTensors() != kTfLiteOk) {
std::cerr << "Failed to allocate tensors" << std::endl;
return false;
}
return true;
}
std::pair<cv::Mat, float> Detector::preprocess(const cv::Mat& image, const cv::Size& input_size) {
float ratio = std::min(static_cast<float>(input_size.width) / image.cols,
static_cast<float>(input_size.height) / image.rows);
cv::Size new_size(static_cast<int>(image.cols * ratio), static_cast<int>(image.rows * ratio));
cv::Mat resized_image;
cv::resize(image, resized_image, new_size, 0, 0, cv::INTER_LINEAR);
cv::Mat padded_image = cv::Mat::ones(input_size, CV_8UC3) * 114;
resized_image.copyTo(padded_image(cv::Rect(0, 0, resized_image.cols, resized_image.rows)));
std::vector<cv::Mat> channels(3);
cv::split(padded_image, channels);
cv::Mat chw_image(3, input_size.height * input_size.width, CV_32F);
for(int i = 0; i < 3; ++i) {
channels[i].convertTo(channels[i], CV_32F);
std::memcpy(chw_image.ptr<float>(i), channels[i].data, channels[i].total() * sizeof(float));
}
cv::Mat reshaped_image = chw_image.reshape(1, {1, 3, input_size.height, input_size.width});
return std::make_pair(reshaped_image, ratio);
}
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> Detector::postprocess(cv::Mat& outputs,
const cv::Size& img_size,
float ratio,
float score_th,
float nms_th) {
std::vector<cv::Rect> bboxes;
std::vector<float> scores;
std::vector<int> class_ids;
std::vector<int> strides = {8, 16, 32};
std::vector<cv::Mat> grids;
std::vector<cv::Mat> expanded_strides;
for (int stride : strides) {
int hsize = img_size.height / stride;
int wsize = img_size.width / stride;
cv::Mat xv, yv;
meshgrid(cv::Range(0, wsize - 1), cv::Range(0, hsize - 1), xv, yv);
cv::Mat grid;
cv::hconcat(xv.reshape(1, 1), yv.reshape(1, 1), grid);
grids.push_back(grid.reshape(2, 1));
expanded_strides.push_back(cv::Mat(grid.size(), CV_32F, cv::Scalar(stride)));
}
cv::Mat grid_cat, stride_cat;
cv::vconcat(grids, grid_cat);
cv::vconcat(expanded_strides, stride_cat);
outputs.colRange(2, 4).convertTo(outputs.colRange(2, 4), CV_32F);
cv::Mat exp_colRange(outputs.colRange(2, 4).size(), CV_32F);
cv::exp(outputs.colRange(2, 4), exp_colRange);
outputs.colRange(0, 2) = (outputs.colRange(0, 2) + grid_cat) * stride_cat;
outputs.colRange(2, 4) = exp_colRange.mul(stride_cat);
cv::Mat predictions = outputs.row(0);
cv::Mat bboxes_mat = predictions.colRange(0, 4);
cv::Mat scores_mat = predictions.col(4).mul(predictions.colRange(5, predictions.cols));
scores.assign(scores_mat.begin<float>(), scores_mat.end<float>());
std::vector<cv::Rect> bboxes_xyxy(bboxes_mat.rows);
for (int i = 0; i < bboxes_mat.rows; ++i) {
float x_center = bboxes_mat.at<float>(i, 0);
float y_center = bboxes_mat.at<float>(i, 1);
float width = bboxes_mat.at<float>(i, 2);
float height = bboxes_mat.at<float>(i, 3);
float x_min = x_center - width / 2.0;
float y_min = y_center - height / 2.0;
float x_max = x_center + width / 2.0;
float y_max = y_center + height / 2.0;
bboxes_xyxy[i] = cv::Rect(cv::Point(x_min / ratio, y_min / ratio), cv::Point(x_max / ratio, y_max / ratio));
}
return nms(bboxes_xyxy, scores, score_th, nms_th);
}
void Detector::meshgrid(const cv::Range& x_range, const cv::Range& y_range, cv::Mat& xv, cv::Mat& yv) {
cv::Mat x_coords = cv::Mat(x_range.size(), 1, CV_32F);
cv::Mat y_coords = cv::Mat(y_range.size(), 1, CV_32F);
for (int i = 0; i < x_range.size(); ++i) {
x_coords.at<float>(i,0) = x_range.start + i;
}
for (int i = 0; i < y_range.size(); ++i) {
y_coords.at<float>(i,0) = y_range.start + i;
}
cv::repeat(x_coords, 1, y_range.size(), xv);
cv::repeat(y_coords.t(), x_range.size(), 1, yv);
}
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>> Detector::nms(const std::vector<cv::Rect>& bboxes,
const std::vector<float>& scores,
float score_th,
float nms_th) {
std::vector<cv::Rect> bboxes_filtered;
std::vector<float> scores_filtered;
std::vector<int> class_ids_filtered;
std::vector<int> indices;
cv::dnn::NMSBoxes(bboxes, scores, score_th, nms_th, indices);
for(int idx : indices) {
bboxes_filtered.push_back(bboxes[idx]);
scores_filtered.push_back(scores[idx]);
class_ids_filtered.push_back(0);
}
return std::make_tuple(bboxes_filtered, scores_filtered, class_ids_filtered);
}
std::pair<std::vector<cv::Rect>, std::vector<float>> Detector::detect(const cv::Mat& image) {
cv::Mat temp_image = image.clone();
auto [preprocessed_image, ratio] = preprocess(temp_image, input_shape_);
std::cout << "Preprocess Completed"<<std::endl;
// Setting input tensor
TfLiteTensor* input_data = interpreter_->tensor(interpreter_->inputs()[0]);
const uint input_width = input_data->dims->data[3];
const uint input_height = input_data->dims->data[2];
const uint input_channels = input_data->dims->data[1];
const uint batch_size = input_data->dims->data[0];
std::cout << "Expected dimension: "<< batch_size << "x" << input_channels << "x" << input_height << "x" << input_width << std::endl;
const uint image_width = preprocessed_image.size[3];
const uint image_height = preprocessed_image.size[2];
const uint image_channels = preprocessed_image.size[1];
const uint image_batch_size = preprocessed_image.size[0];
std::cout << "Image dimension: "<< image_batch_size << "x" << image_channels << "x" << image_height << "x" << image_width << std::endl;
if(input_data->type !=kTfLiteFloat32){
std::cerr << "input tensor is not of type float" << std::endl;
return std::make_pair(std::vector<cv::Rect>(), std::vector<float>());
}
if(input_data->data.f == nullptr) {
std::cerr << "input tensor data pointer is null" << std::endl;
return std::make_pair(std::vector<cv::Rect>(), std::vector<float>());
}
std::memcpy(input_data->data.f, preprocessed_image.ptr<float>(0), batch_size * input_width * input_height * input_channels * sizeof(float));
if(memcmp(input_data->data.f, preprocessed_image.ptr<float>(0),batch_size * input_width * input_height * input_channels * sizeof(float)) != 0){
std::cerr << "data copy to input tensor failed" << std::endl;
return std::make_pair(std::vector<cv::Rect>(), std::vector<float>());
}
else{
std::cout << "Set up Input Tensor Completed"<<std::endl;
}
// Running inference
interpreter_->Invoke();
std::cout << "Inference Completed"<<std::endl;
// Getting output tensor
float* output_tensor = interpreter_->typed_output_tensor<float>(0);
size_t output_size = interpreter_->tensor(interpreter_->outputs()[0])->bytes / sizeof(float);
cv::Mat results(1, output_size, CV_32F, output_tensor);
std::cout << "Get Results Completed"<<std::endl;
// Postprocessing
auto [bboxes_xyxy, scores, class_ids] = postprocess(results, input_shape_, ratio, score_th_, nms_th_);
// Converting the bboxes to cv::Rect and packing results
std::vector<cv::Rect> result_rect_list;
for (size_t i = 0; i < bboxes_xyxy.size(); ++i) {
result_rect_list.push_back(bboxes_xyxy[i]);
}
// Returning the list of rectangles and the associated scores
return {result_rect_list, scores};
}
my board image is nanbield 6.6.3_1.0.0 full image
I tried to run it using VX Delegate and NPU and encounter a problem when running the code
root@imx8mpevk:/run/media/SD CARD-sda1/test_npu# ./detector_app lena_color_512.tif
INFO: Vx delegate: allowed_cache_mode set to 0.
INFO: Vx delegate: device num set to 0.
INFO: Vx delegate: allowed_builtin_code set to 0.
INFO: Vx delegate: error_during_init set to 0.
INFO: Vx delegate: error_during_prepare set to 0.
INFO: Vx delegate: error_during_invoke set to 0.
Preprocess Completed
Expected dimension: 1x3x192x192
Image dimension: 1x3x192x192
Set up Input Tensor Completed
E [/usr/src/debug/tim-vx/1.1.88-r[ 126.612163] audit: type=1701 audit(1695250801.923:18): auid=4294967295 uid=0 gid=0 ses=4294967295 pid=1270 comm="detector_app" exe=2F72756E2F6D656469612F534420434152442D736461312F746573745F6E70752F6465746563746F725F617070 sig=6 res=1
0/src/tim/transform/ops/op_layout_inference.cc:MapAxis:177]Map axis failed.
detector_app: /usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:178: uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t): Assertion `false' failed.
Aborted (core dumped)
I also tried to get the gdb debug running, and it return something like this:
(gdb) set args lena_color_512.tif
(gdb) run
Starting program: /run/media/SD CARD-sda1/test_npu/detector_app lena_color_512.tif
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
INFO: Vx delegate: allowed_cache_mode set to 0.
INFO: Vx delegate: device num set to 0.
INFO: Vx delegate: allowed_builtin_code set to 0.
INFO: Vx delegate: error_during_init set to 0.
INFO: Vx delegate: error_during_prepare set to 0.
INFO: Vx delegate: error_during_invoke set to 0.
Preprocess Completed
Expected dimension: 1x3x192x192
Image dimension: 1x3x192x192
Set up Input Tensor Completed
[New Thread 0xfffff146cf00 (LWP 1660)]
E [/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:MapAxis:177]Map axis failed.
detector_app: /usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc:178: uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t): Assertion `false' failed.
Thread 1 "detector_app" received signal SIGABRT, Aborted.
__pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
44 pthread_kill.c: No such file or directory.
(gdb) bt
#0 __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
#1 0x0000fffff69c0568 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
#2 0x0000fffff697acd0 in __GI_raise (sig=sig@entry=6) at /usr/src/debug/glibc/2.38+git-r0/sysdeps/posix/raise.c:26
#3 0x0000fffff6966ef0 in __GI_abort () at abort.c:79
#4 0x0000fffff69743f8 in __assert_fail_base (fmt=0xfffff6a8a8e8 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0xfffff1ffdcf0 "false",
file=file@entry=0xfffff1fff568 "/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc", line=line@entry=178,
function=function@entry=0xfffff1fff5d8 "uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t)") at assert.c:92
#5 0x0000fffff6974470 in __assert_fail (assertion=0xfffff1ffdcf0 "false", file=0xfffff1fff568 "/usr/src/debug/tim-vx/1.1.88-r0/src/tim/transform/ops/op_layout_inference.cc", line=178,
function=0xfffff1fff5d8 "uint32_t tim::transform::OpLayoutInfer::MapAxis(const std::vector<unsigned int>&, uint32_t)") at assert.c:101
#6 0x0000fffff1fa5f74 in tim::transform::OpLayoutInfer::MapAxis(std::vector<unsigned int, std::allocator<unsigned int> > const&, unsigned int) () from /usr/lib/libtim-vx.so
#7 0x0000fffff1f6a1b0 in ?? () from /usr/lib/libtim-vx.so
#8 0x0000fffff1f4e5f4 in tim::transform::layout_inference_impl::HandleLayoutInfer(std::shared_ptr<tim::transform::layout_inference_impl::LayoutInferContext>&, std::shared_ptr<tim::vx::Operation> const&) () from /usr/lib/libtim-vx.so
#9 0x0000fffff1f531f4 in tim::transform::LayoutInference(std::shared_ptr<tim::vx::Graph> const&, std::shared_ptr<tim::vx::Context>&, std::map<std::shared_ptr<tim::vx::Tensor>, std::shared_ptr<tim::transform::IPermuteVector>, std::less<std::shared_ptr<tim::vx::Tensor> >, std::allocator<std::pair<std::shared_ptr<tim::vx::Tensor> const, std::shared_ptr<tim::transform::IPermuteVector> > > >) () from /usr/lib/libtim-vx.so
#10 0x0000fffff23d85ac in vx::delegate::Delegate::Invoke(vx::delegate::OpData const&, TfLiteContext*, TfLiteNode*) () from /usr/lib/libvx_delegate.so
#11 0x0000fffff7be9d9c in tflite::Subgraph::InvokeImpl() () from /usr/lib/libtensorflow-lite.so.2.14.0
#12 0x0000fffff7bea388 in tflite::Subgraph::Invoke() () from /usr/lib/libtensorflow-lite.so.2.14.0
#13 0x0000fffff7bd440c in tflite::impl::Interpreter::Invoke() () from /usr/lib/libtensorflow-lite.so.2.14.0
#14 0x0000aaaaaaaa62e0 in Detector::detect (this=this@entry=0xfffffffff890, image=...)
at /home/ubuntu/imx-yocto-bsp/sdk/sysroots/armv8a-poky-linux/usr/include/c++/13.2.0/bits/unique_ptr.h:199
#15 0x0000aaaaaaaa35b0 in main (argc=<optimized out>, argv=<optimized out>) at /home/ubuntu/imx-yocto-bsp/tflite_test/build_minim/main.cpp:29
Does anyone have a clue what is wrong? because I am not sure what happened here. but what i only know that the assertion at op_layout_inference.cc:MapAxis:177 Map axis failed because of assertion error (?)
Thank you in advance