diff --git a/misc/opencv-dnn-onnx-inference/.gitignore b/misc/opencv-dnn-onnx-inference/.gitignore new file mode 100644 index 0000000..4a0b530 --- /dev/null +++ b/misc/opencv-dnn-onnx-inference/.gitignore @@ -0,0 +1,74 @@ +# This file is used to ignore files which are generated +# ---------------------------------------------------------------------------- + +*~ +*.autosave +*.a +*.core +*.moc +*.o +*.obj +*.orig +*.rej +*.so +*.so.* +*_pch.h.cpp +*_resource.rc +*.qm +.#* +*.*# +core +!core/ +tags +.DS_Store +.directory +*.debug +Makefile* +*.prl +*.app +moc_*.cpp +ui_*.h +qrc_*.cpp +Thumbs.db +*.res +*.rc +/.qmake.cache +/.qmake.stash + +# qtcreator generated files +*.pro.user* +CMakeLists.txt.user* + +# xemacs temporary files +*.flc + +# Vim temporary files +.*.swp + +# Visual Studio generated files +*.ib_pdb_index +*.idb +*.ilk +*.pdb +*.sln +*.suo +*.vcproj +*vcproj.*.*.user +*.ncb +*.sdf +*.opensdf +*.vcxproj +*vcxproj.* + +# MinGW generated files +*.Debug +*.Release + +# Python byte code +*.pyc + +# Binaries +# -------- +*.dll +*.exe + diff --git a/misc/opencv-dnn-onnx-inference/main-opencv-example.cpp b/misc/opencv-dnn-onnx-inference/main-opencv-example.cpp new file mode 100644 index 0000000..fc469c4 --- /dev/null +++ b/misc/opencv-dnn-onnx-inference/main-opencv-example.cpp @@ -0,0 +1,387 @@ +/** + * @file yolo_detector.cpp + * @brief Yolo Object Detection Sample + * @author OpenCV team + */ + +//![includes] +#include +#include +#include +#include +#include +#include "iostream" +#include "common.hpp" +#include +//![includes] + +using namespace cv; +using namespace cv::dnn; + +void getClasses(std::string classesFile); +void drawPrediction(int classId, float conf, int left, int top, int right, int bottom, Mat& frame); +void yoloPostProcessing( + std::vector& outs, + std::vector& keep_classIds, + std::vector& keep_confidences, + std::vector& keep_boxes, + float conf_threshold, + float iou_threshold, + const std::string& model_name, + const int nc + ); + +std::vector classes; + + +std::string keys = + "{ help h | | Print help message. }" + "{ device | 0 | camera device number. }" + "{ model | onnx/models/yolox_s_inf_decoder.onnx | Default model. }" + "{ yolo | yolox | yolo model version. }" + "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera. }" + "{ classes | | Optional path to a text file with names of classes to label detected objects. }" + "{ nc | 80 | Number of classes. Default is 80 (coming from COCO dataset). }" + "{ thr | .5 | Confidence threshold. }" + "{ nms | .4 | Non-maximum suppression threshold. }" + "{ mean | 0.0 | Normalization constant. }" + "{ scale | 1.0 | Preprocess input image by multiplying on a scale factor. }" + "{ width | 640 | Preprocess input image by resizing to a specific width. }" + "{ height | 640 | Preprocess input image by resizing to a specific height. }" + "{ rgb | 1 | Indicate that model works with RGB input images instead BGR ones. }" + "{ padvalue | 114.0 | padding value. }" + "{ paddingmode | 2 | Choose one of computation backends: " + "0: resize to required input size without extra processing, " + "1: Image will be cropped after resize, " + "2: Resize image to the desired size while preserving the aspect ratio of original image }" + "{ backend | 0 | Choose one of computation backends: " + "0: automatically (by default), " + "1: Halide language (http://halide-lang.org/), " + "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), " + "3: OpenCV implementation, " + "4: VKCOM, " + "5: CUDA }" + "{ target | 0 | Choose one of target computation devices: " + "0: CPU target (by default), " + "1: OpenCL, " + "2: OpenCL fp16 (half-float precision), " + "3: VPU, " + "4: Vulkan, " + "6: CUDA, " + "7: CUDA fp16 (half-float preprocess) }" + "{ async | 0 | Number of asynchronous forwards at the same time. " + "Choose 0 for synchronous mode }"; + +void getClasses(std::string classesFile) +{ + std::ifstream ifs(classesFile.c_str()); + if (!ifs.is_open()) + CV_Error(Error::StsError, "File " + classesFile + " not found"); + std::string line; + while (std::getline(ifs, line)) + classes.push_back(line); +} + +void drawPrediction(int classId, float conf, int left, int top, int right, int bottom, Mat& frame) +{ + rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0)); + + std::string label = format("%.2f", conf); + if (!classes.empty()) + { + CV_Assert(classId < (int)classes.size()); + label = classes[classId] + ": " + label; + } + + int baseLine; + Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + top = max(top, labelSize.height); + rectangle(frame, Point(left, top - labelSize.height), + Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED); + putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar()); +} + +void yoloPostProcessing( + std::vector& outs, + std::vector& keep_classIds, + std::vector& keep_confidences, + std::vector& keep_boxes, + float conf_threshold, + float iou_threshold, + const std::string& model_name, + const int nc=80) +{ + // Retrieve + std::vector classIds; + std::vector confidences; + std::vector boxes; + + if (model_name == "yolov8" || model_name == "yolov10" || + model_name == "yolov9") + { + cv::transposeND(outs[0], {0, 2, 1}, outs[0]); + } + + if (model_name == "yolonas") + { + // outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84] + Mat concat_out; + // squeeze the first dimension + outs[0] = outs[0].reshape(1, outs[0].size[1]); + outs[1] = outs[1].reshape(1, outs[1].size[1]); + cv::hconcat(outs[1], outs[0], concat_out); + outs[0] = concat_out; + // remove the second element + outs.pop_back(); + // unsqueeze the first dimension + outs[0] = outs[0].reshape(0, std::vector{1, 8400, nc + 4}); + } + + // assert if last dim is 85 or 84 + CV_CheckEQ(outs[0].dims, 3, "Invalid output shape. The shape should be [1, #anchors, 85 or 84]"); + CV_CheckEQ((outs[0].size[2] == nc + 5 || outs[0].size[2] == 80 + 4), true, "Invalid output shape: "); + + for (auto preds : outs) + { + preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85] + for (int i = 0; i < preds.rows; ++i) + { + // filter out non object + float obj_conf = (model_name == "yolov8" || model_name == "yolonas" || + model_name == "yolov9" || model_name == "yolov10") ? 1.0f : preds.at(i, 4) ; + if (obj_conf < conf_threshold) + continue; + + Mat scores = preds.row(i).colRange((model_name == "yolov8" || model_name == "yolonas" || model_name == "yolov9" || model_name == "yolov10") ? 4 : 5, preds.cols); + double conf; + Point maxLoc; + minMaxLoc(scores, 0, &conf, 0, &maxLoc); + + conf = (model_name == "yolov8" || model_name == "yolonas" || model_name == "yolov9" || model_name == "yolov10") ? conf : conf * obj_conf; + if (conf < conf_threshold) + continue; + + // get bbox coords + float* det = preds.ptr(i); + double cx = det[0]; + double cy = det[1]; + double w = det[2]; + double h = det[3]; + + // [x1, y1, x2, y2] + if (model_name == "yolonas" || model_name == "yolov10"){ + boxes.push_back(Rect2d(cx, cy, w, h)); + } else { + boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h, + cx + 0.5 * w, cy + 0.5 * h)); + } + classIds.push_back(maxLoc.x); + confidences.push_back(static_cast(conf)); + } + } + + // NMS + std::vector keep_idx; + NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, keep_idx); + + for (auto i : keep_idx) + { + keep_classIds.push_back(classIds[i]); + keep_confidences.push_back(confidences[i]); + keep_boxes.push_back(boxes[i]); + } +} + +/** + * @function main + * @brief Main function + */ +int main(int argc, char** argv) +{ + CommandLineParser parser(argc, argv, keys); + parser.about("Use this script to run object detection deep learning networks using OpenCV."); + if (parser.has("help")) + { + parser.printMessage(); + return 0; + } + + CV_Assert(parser.has("model")); + CV_Assert(parser.has("yolo")); + // if model is default, use findFile to get the full path otherwise use the given path + std::string weightPath = "yolov8n.onnx"; + std::string yolo_model = "yolov8"; + int nc = parser.get("nc"); + + float confThreshold = parser.get("thr"); + float nmsThreshold = parser.get("nms"); + //![preprocess_params] + float paddingValue = parser.get("padvalue"); + bool swapRB = parser.get("rgb"); + int inpWidth = parser.get("width"); + int inpHeight = parser.get("height"); + Scalar scale = parser.get("scale"); + Scalar mean = parser.get("mean"); + ImagePaddingMode paddingMode = static_cast(parser.get("paddingmode")); + //![preprocess_params] + + // check if yolo model is valid + if (yolo_model != "yolov5" && yolo_model != "yolov6" + && yolo_model != "yolov7" && yolo_model != "yolov8" + && yolo_model != "yolov10" && yolo_model !="yolov9" + && yolo_model != "yolox" && yolo_model != "yolonas") + CV_Error(Error::StsError, "Invalid yolo model: " + yolo_model); + + // get classes + if (parser.has("classes")) + { + getClasses(findFile(parser.get("classes"))); + } + + // load model + //![read_net] + Net net = readNet(weightPath); + int backend = parser.get("backend"); + net.setPreferableBackend(backend); + net.setPreferableTarget(parser.get("target")); + //![read_net] + + VideoCapture cap; + Mat img; + bool isImage = false; + bool isCamera = false; + + isImage = true; + img = imread("bus.png"); + + /* + // Check if input is given + if (parser.has("input")) + { + String input = parser.get("input"); + // Check if the input is an image + if (input.find(".jpg") != String::npos || input.find(".png") != String::npos) + { + img = imread(findFile(input)); + if (img.empty()) + { + CV_Error(Error::StsError, "Cannot read image file: " + input); + } + isImage = true; + } + else + { + cap.open(input); + if (!cap.isOpened()) + { + CV_Error(Error::StsError, "Cannot open video " + input); + } + isCamera = true; + } + } + else + { + int cameraIndex = parser.get("device"); + cap.open(cameraIndex); + if (!cap.isOpened()) + { + CV_Error(Error::StsError, cv::format("Cannot open camera #%d", cameraIndex)); + } + isCamera = true; + } + */ + + // image pre-processing + //![preprocess_call] + Size size(inpWidth, inpHeight); + Image2BlobParams imgParams( + scale, + size, + mean, + swapRB, + CV_32F, + DNN_LAYOUT_NCHW, + paddingMode, + paddingValue); + + // rescale boxes back to original image + Image2BlobParams paramNet; + paramNet.scalefactor = scale; + paramNet.size = size; + paramNet.mean = mean; + paramNet.swapRB = swapRB; + paramNet.paddingmode = paddingMode; + //![preprocess_call] + + //![forward_buffers] + std::vector outs; + std::vector keep_classIds; + std::vector keep_confidences; + std::vector keep_boxes; + std::vector boxes; + //![forward_buffers] + + Mat inp; + while (waitKey(1) < 0) + { + + if (isCamera) + cap >> img; + if (img.empty()) + { + std::cout << "Empty frame" << std::endl; + waitKey(); + break; + } + //![preprocess_call_func] + inp = blobFromImageWithParams(img, imgParams); + //![preprocess_call_func] + + //![forward] + net.setInput(inp); + net.forward(outs, net.getUnconnectedOutLayersNames()); + //![forward] + + //![postprocess] + yoloPostProcessing( + outs, keep_classIds, keep_confidences, keep_boxes, + confThreshold, nmsThreshold, + yolo_model, + nc); + //![postprocess] + + // covert Rect2d to Rect + //![draw_boxes] + for (auto box : keep_boxes) + { + boxes.push_back(Rect(cvFloor(box.x), cvFloor(box.y), cvFloor(box.width - box.x), cvFloor(box.height - box.y))); + } + + paramNet.blobRectsToImageRects(boxes, boxes, img.size()); + + for (size_t idx = 0; idx < boxes.size(); ++idx) + { + Rect box = boxes[idx]; + drawPrediction(keep_classIds[idx], keep_confidences[idx], box.x, box.y, + box.width + box.x, box.height + box.y, img); + } + + const std::string kWinName = "Yolo Object Detector"; + namedWindow(kWinName, WINDOW_NORMAL); + imshow(kWinName, img); + //![draw_boxes] + + outs.clear(); + keep_classIds.clear(); + keep_confidences.clear(); + keep_boxes.clear(); + boxes.clear(); + + if (isImage) + { + waitKey(); + break; + } + } +} diff --git a/misc/opencv-dnn-onnx-inference/main.cpp b/misc/opencv-dnn-onnx-inference/main.cpp new file mode 100644 index 0000000..560df73 --- /dev/null +++ b/misc/opencv-dnn-onnx-inference/main.cpp @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include + +#define CONFIDENCE_THRESHOLD 0.2 + +using namespace std; +using namespace cv; + +// function to create vector of class names +std::vector createClaseNames() { + std::vector classNames; + classNames.push_back("background"); + classNames.push_back("aeroplane"); + classNames.push_back("bicycle"); + classNames.push_back("bird"); + classNames.push_back("boat"); + classNames.push_back("bottle"); + classNames.push_back("bus"); + classNames.push_back("car"); + classNames.push_back("cat"); + classNames.push_back("chair"); + classNames.push_back("cow"); + classNames.push_back("diningtable"); + classNames.push_back("dog"); + classNames.push_back("horse"); + classNames.push_back("motorbike"); + classNames.push_back("person"); + classNames.push_back("pottedplant"); + classNames.push_back("sheep"); + return classNames; +}; + +int main(int argc, char *argv[]) +{ + if (argc < 3) { + qDebug() << "Give ONNX model as first argument and image as second one."; + return 1; + } + std::vector classNames = createClaseNames(); + + std::string model_filename = argv[1]; + std::string image_filename = argv[2]; + + // Load the ONNX model + cv::dnn::Net net = cv::dnn::readNetFromONNX(model_filename); + + // Set backend to OpenCL + net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT); + //net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL); + + // Load an image and preprocess it + cv::Mat image = cv::imread(image_filename); + cv::Mat blob = cv::dnn::blobFromImage(image, 1/255.0, cv::Size(640, 640), cv::Scalar(), true, false); + cv::imwrite("tmpInput.png", image); + + // Set the input for the network + net.setInput(blob); + std::vector outputs; + net.forward(outputs); + cv::Mat output = outputs[0]; + + std::vector layerNames = net.getLayerNames(); + std::vector outLayerIndices = net.getUnconnectedOutLayers(); + for (int index : outLayerIndices) { + std::cout << layerNames[index - 1] << std::endl; + } + + + cv::Mat detections = net.forward("output0"); + + // print some information about detections + std::cout << "dims: " << outputs[0].dims << std::endl; + std::cout << "size: " << outputs[0].size << std::endl; + + int num_detections0 = outputs.size(); + int num_classes0 = outputs[0].size[1]; + int rows = outputs[0].size[1]; + qDebug() << "num_detections0:" << num_detections0 << "num_classes0:" << num_classes0 << "rows:" << rows; + + int num_detections = detections.size[2]; // 8400 + int num_attributes = detections.size[1]; // 14 + qDebug() << "num_detections:" << num_detections << "num_attributes:" << num_attributes; + + + detections = detections.reshape(1, num_detections); + + for (int i = 0; i < num_detections; i++) { + Mat row = detections.row(i); + float box_confidence = row.at(4); + float cls_confidence = row.at(5); + if (box_confidence > 0.5 && cls_confidence > 0.5) { + std::cout << "box_confidence " << box_confidence << " cls_confidence: " << cls_confidence << std::endl; + } + } + + + + + /* + // Output tensor processing + // Assuming output tensor has shape [1, 14, 8400] + int num_detections = detections.size[2]; // 8400 + int num_attributes = detections.size[1]; // 14 + qDebug() << "num_detections:" << num_detections << "num_attributes:" << num_attributes; + + // Extract and print confidence for each detection + const float* data = (float*)output.data; + for (int i = 0; i < num_detections; i++) { + // Confidence value is at index 4 (based on YOLO-like model output format) + float confidence = data[i * num_attributes + 3]; + // Print confidence value + if (confidence > 0.5f) { + std::cout << "Detection " << i << " confidence: " << confidence << std::endl; + } + } + */ + + /* + // Assuming outputs is a vector of cv::Mat containing the model's output + for (int i = 0; i < outputs[0].size[0]; ++i) { + for (int j = 0; j < outputs[0].size[1]; ++j) { + float confidence = outputs[0].at(i, j); + // Print or use the confidence value + std::cout << "Confidence: " << confidence << std::endl; + } + } + */ + + /* + // Output processing variables + std::vector class_ids; + std::vector confidences; + std::vector boxes; + + // Analyze each output + for (const auto& output : outputs) { + // Each row is a detection: [center_x, center_y, width, height, conf, class1_score, class2_score, ...] + const auto* data = (float*)output.data; + + for (int i = 0; i < output.rows; i++, data += output.cols) { + float confidence = data[4]; // Objectness confidence + + if (confidence >= 0.5) { // Apply a confidence threshold + cv::Mat scores = output.row(i).colRange(5, output.cols); + cv::Point class_id_point; + double max_class_score; + + cv::minMaxLoc(scores, 0, &max_class_score, 0, &class_id_point); + int class_id = class_id_point.x; + + if (max_class_score >= 0.5) { // Apply a class score threshold + // Get bounding box + int center_x = (int)(data[0] * image.cols); + int center_y = (int)(data[1] * image.rows); + int width = (int)(data[2] * image.cols); + int height = (int)(data[3] * image.rows); + int left = center_x - width / 2; + int top = center_y - height / 2; + + // Store the results + class_ids.push_back(class_id); + confidences.push_back(confidence); + boxes.push_back(cv::Rect(left, top, width, height)); + } + } + } + } + */ + + /* + for (int i = 0; i < num_detections; ++i) { + for (int a = 0; a < 10; a++) { + qDebug() << "confidence:" << confidence << "class_id" << class_id; + } + //float confidence = outputs[0].at(i, 4); + //int class_id = outputs[0].at(i, 5); + qDebug() << "confidence:" << confidence << "class_id" << class_id; + } + */ + + /* + std::vector class_ids; + std::vector confidences; + std::vector boxes; + // Output tensor processing + for (size_t i = 0; i < outputs.size(); i++) { + float* data = (float*)outputs[i].data; + for (int j = 0; j < outputs[i].rows; ++j) { + float confidence = data[4]; // Objectness confidence + qDebug() <<" Confidence: " << confidence; + + if (confidence >= CONFIDENCE_THRESHOLD) { + // Get class with the highest score + cv::Mat scores = outputs[i].row(j).colRange(5, num_classes + 5); + cv::Point class_id_point; + double max_class_score; + cv::minMaxLoc(scores, 0, &max_class_score, 0, &class_id_point); + + qDebug() << "Class ID: " << class_id_point.x << " Confidence: " << confidence; + + if (max_class_score > CONFIDENCE_THRESHOLD) { + int center_x = (int)(data[0] * image.cols); + int center_y = (int)(data[1] * image.rows); + int width = (int)(data[2] * image.cols); + int height = (int)(data[3] * image.rows); + int left = center_x - width / 2; + int top = center_y - height / 2; + + // Store the results + class_ids.push_back(class_id_point.x); + confidences.push_back((float)max_class_score); + boxes.push_back(cv::Rect(left, top, width, height)); + } + } + data += num_classes + 5; // Move to the next detection + } + } + */ + + /* + // Process the output + for (int i = 0; i < num_detections; ++i) { + float confidence = outputs[0].at(i, 4); + int class_id = outputs[0].at(i, 5); + // ... extract bounding box coordinates + float x_center = outputs[0].at(i, 0); + float y_center = outputs[0].at(i, 1); + float width = outputs[0].at(i, 2); + float height = outputs[0].at(i, 3); + + // Calculate bounding box corners + int x = (int)(x_center - width / 2.0); + int y = (int)(y_center - height / 2.0); + int width_int = (int)width; + int height_int = (int)height; + + // Print or use the detected information + qDebug() << "Class ID: " << class_id << " Confidence: " << confidence << " Bounding Box: (" << x << ", " << y << ", " << width_int << ", " << height_int << ")"; + } + */ + + + /* + // Perform forward pass + for (int i = 0; i < 10; i++) { + std::vector outputs; + QElapsedTimer timer; + timer.start(); + net.forward(outputs); + qDebug() << "Inference completed in" << timer.elapsed() << "ms."; + } + */ + + // Process the output (YOLO-specific post-processing like NMS is needed) + // Outputs contain class scores, bounding boxes, etc. + + + return 0; +} diff --git a/misc/opencv-dnn-onnx-inference/opencv-dnn-onnx-inference.pro b/misc/opencv-dnn-onnx-inference/opencv-dnn-onnx-inference.pro new file mode 100644 index 0000000..246e760 --- /dev/null +++ b/misc/opencv-dnn-onnx-inference/opencv-dnn-onnx-inference.pro @@ -0,0 +1,16 @@ +QT = core +QT -= gui +CONFIG += c++17 cmdline concurrent console +# link_pkgconfig + +SOURCES += \ + main-opencv-example.cpp + #main.cpp + +HEADERS += \ + common.hpp + + +INCLUDEPATH += /opt/opencv-4.10.0/include/opencv4/ +LIBS += /opt/opencv-4.10.0/lib/libopencv_world.so +QMAKE_LFLAGS += -Wl,-rpath,/opt/opencv-4.10.0/lib diff --git a/misc/rtsp_ai_player/src-onnx-runtime/aiengineinferenceonnxruntime.cpp b/misc/rtsp_ai_player/src-onnx-runtime/aiengineinferenceonnxruntime.cpp index 8f7d343..ab74d32 100644 --- a/misc/rtsp_ai_player/src-onnx-runtime/aiengineinferenceonnxruntime.cpp +++ b/misc/rtsp_ai_player/src-onnx-runtime/aiengineinferenceonnxruntime.cpp @@ -90,6 +90,9 @@ void AiEngineInferencevOnnxRuntime::performInferenceSlot(cv::Mat frame) qDebug() << "performInferenceSlot() invalid classId =" << detection.classId; continue; } + else { + cv::imwrite("scaledImage.png", scaledImage); + } // Add detected objects to the results AiEngineObject object; diff --git a/misc/rtsp_ai_player/src-opencv-onnx/inference.cpp b/misc/rtsp_ai_player/src-opencv-onnx/inference.cpp index 0ef6aca..e95f8a1 100644 --- a/misc/rtsp_ai_player/src-opencv-onnx/inference.cpp +++ b/misc/rtsp_ai_player/src-opencv-onnx/inference.cpp @@ -28,11 +28,13 @@ std::vector Inference::runInference(const cv::Mat &input) int rows = outputs[0].size[1]; int dimensions = outputs[0].size[2]; - bool yolov8 = false; + bool yolov8 = true; // yolov5 has an output of shape (batchSize, 25200, 85) (Num classes + box[x,y,w,h] + confidence[c]) // yolov8 has an output of shape (batchSize, 84, 8400) (Num classes + box[x,y,w,h]) + /* if (dimensions > rows) // Check if the shape[2] is more than shape[1] (yolov8) { + std::cout << "yolov8 = " << yolov8 << std::endl; yolov8 = true; rows = outputs[0].size[2]; dimensions = outputs[0].size[1]; @@ -40,6 +42,8 @@ std::vector Inference::runInference(const cv::Mat &input) outputs[0] = outputs[0].reshape(1, dimensions); cv::transpose(outputs[0], outputs[0]); } + */ + float *data = (float *)outputs[0].data; float x_factor = modelInput.cols / modelShape.width;