Fully working opi_rtsp on PC with YOLOv8 ONNX models

2026-04-22 14:46:34 +00:00 · 2024-07-10 18:37:33 +02:00
parent 896307d296
commit 3d39d8fd99
5 changed files with 272 additions and 220 deletions
@@ -1,162 +1,209 @@
 #include "inference.h"
-#include <algorithm>
-#include <iostream>

-const std::vector<std::string> InferenceEngine::CLASS_NAMES = {
-    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
-    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
-    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
-    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
-    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
-    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
-    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
-    "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
-    "scissors", "teddy bear", "hair drier", "toothbrush"};
-
-InferenceEngine::InferenceEngine(const std::string &model_path)
-    : env(ORT_LOGGING_LEVEL_WARNING, "ONNXRuntime"),
-      session_options(),
-      session(env, model_path.c_str(), session_options),
-      input_shape{1, 3, 640, 640}
+/*
+ * Store the run configuration, print the requested model input size and
+ * load the ONNX network.
+ *
+ * @param onnxModelPath: path to the ONNX model file
+ * @param modelInputShape: input size the network is fed with
+ * @param classesTxtFile: path to a text file with one class name per line
+ * @param runWithCuda: request the CUDA backend/target instead of the CPU
+ */
+Inference::Inference(const std::string &onnxModelPath, const cv::Size &modelInputShape, const std::string &classesTxtFile, const bool &runWithCuda)
 {
-    session_options.SetIntraOpNumThreads(1);
-    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC);
+    this->cudaEnabled = runWithCuda;
+    this->classesPath = classesTxtFile;
+    this->modelShape = modelInputShape;
+    this->modelPath = onnxModelPath;
+
+    std::cout << "SIZE = " << modelInputShape.width << "x" << modelInputShape.height << std::endl;
+
+    // The classes are hard-coded for this example, so loadClassesFromFile() is not called
+    loadOnnxNetwork();
 }

-InferenceEngine::~InferenceEngine() {}
-
-/*
- * Function to preprocess the image
- *
- * @param image_path: path to the image
- * @param orig_width: original width of the image
- * @param orig_height: original height of the image
- *
- * @return: vector of floats representing the preprocessed image
- */
-std::vector<float> InferenceEngine::preprocessImage(const cv::Mat &image)
+/*
+ * Run the network on one image and return the detections that survive
+ * non-maximum suppression.
+ *
+ * @param input: image to run the detector on; it is padded to a square first
+ *               when letterBoxForSquare is set and the model shape is square
+ *
+ * @return: vector of Detection objects (class id, class name, confidence,
+ *          box in pixel coordinates of the possibly padded input, random color)
+ */
+std::vector<Detection> Inference::runInference(const cv::Mat &input)
 {
-    if (image.empty())
+    cv::Mat modelInput = input;
+    if (letterBoxForSquare && modelShape.width == modelShape.height)
+        modelInput = formatToSquare(modelInput);
+
+    cv::Mat blob;
+    cv::dnn::blobFromImage(modelInput, blob, 1.0/255.0, modelShape, cv::Scalar(), true, false);
+    net.setInput(blob);
+
+    std::vector<cv::Mat> outputs;
+    net.forward(outputs, net.getUnconnectedOutLayersNames());
+
+    int rows = outputs[0].size[1];
+    int dimensions = outputs[0].size[2];
+
+    bool yolov8 = false;
+    // yolov5 has an output of shape (batchSize, 25200, 85) (Num classes + box[x,y,w,h] + confidence[c])
+    // yolov8 has an output of shape (batchSize, 84,  8400) (Num classes + box[x,y,w,h])
+    if (dimensions > rows) // Check if the shape[2] is more than shape[1] (yolov8)
    {
-        throw std::runtime_error("Could not read the image");
+        yolov8 = true;
+        rows = outputs[0].size[2];
+        dimensions = outputs[0].size[1];
+
+        // Bring the yolov8 layout to one candidate per row, like yolov5
+        outputs[0] = outputs[0].reshape(1, dimensions);
+        cv::transpose(outputs[0], outputs[0]);
    }
+    float *data = (float *)outputs[0].data;

-    cv::Mat resized_image;
-    cv::resize(image, resized_image, cv::Size(input_shape[2], input_shape[3]));
+    // Force floating-point division: if modelShape holds integers, int / int
+    // would truncate the scale (e.g. 1000 / 640 -> 1) and shrink every box
+    float x_factor = static_cast<float>(modelInput.cols) / static_cast<float>(modelShape.width);
+    float y_factor = static_cast<float>(modelInput.rows) / static_cast<float>(modelShape.height);

-    resized_image.convertTo(resized_image, CV_32F, 1.0 / 255);
+    std::vector<int> class_ids;
+    std::vector<float> confidences;
+    std::vector<cv::Rect> boxes;

-    std::vector<cv::Mat> channels(3);
-    cv::split(resized_image, channels);
-
-    std::vector<float> input_tensor_values;
-    for (int c = 0; c < 3; ++c)
+    for (int i = 0; i < rows; ++i)
    {
-        input_tensor_values.insert(input_tensor_values.end(), (float *)channels[c].data, (float *)channels[c].data + input_shape[2] * input_shape[3]);
-    }
-
-    return input_tensor_values;
-}
-
-/*
-    * Function to filter the detections based on the confidence threshold
-    *
-    * @param results: vector of floats representing the output tensor
-    * @param confidence_threshold: minimum confidence threshold
-    * @param img_width: width of the input image
-    * @param img_height: height of the input image
-    * @param orig_width: original width of the image
-    * @param orig_height: original height of the image
-    *
-    * @return: vector of Detection objects
-
-*/
-std::vector<Detection> InferenceEngine::filterDetections(const std::vector<float> &results, float confidence_threshold, int img_width, int img_height, int orig_width, int orig_height)
-{
-    std::vector<Detection> detections;
-    const int num_detections = results.size() / 6;
-
-    for (int i = 0; i < num_detections; ++i)
-    {
-        float left = results[i * 6 + 0];
-        float top = results[i * 6 + 1];
-        float right = results[i * 6 + 2];
-        float bottom = results[i * 6 + 3];
-        float confidence = results[i * 6 + 4];
-        int class_id = results[i * 6 + 5];
-
-        if (confidence >= confidence_threshold)
+        if (yolov8)
        {
-            int x = static_cast<int>(left * orig_width / img_width);
-            int y = static_cast<int>(top * orig_height / img_height);
-            int width = static_cast<int>((right - left) * orig_width / img_width);
-            int height = static_cast<int>((bottom - top) * orig_height / img_height);
+            // yolov8 rows are [x, y, w, h, class scores...]
+            float *classes_scores = data+4;

-            detections.push_back(
-                {confidence,
-                 cv::Rect(x, y, width, height),
-                 class_id,
-                 CLASS_NAMES[class_id]});
+            cv::Mat scores(1, classes.size(), CV_32FC1, classes_scores);
+            cv::Point class_id;
+            double maxClassScore;
+
+            minMaxLoc(scores, 0, &maxClassScore, 0, &class_id);
+
+            if (maxClassScore > modelScoreThreshold)
+            {
+                confidences.push_back(maxClassScore);
+                class_ids.push_back(class_id.x);
+
+                float x = data[0];
+                float y = data[1];
+                float w = data[2];
+                float h = data[3];
+
+                // Convert center/size to top-left/size, scaled by the input/model ratio
+                int left = int((x - 0.5 * w) * x_factor);
+                int top = int((y - 0.5 * h) * y_factor);
+
+                int width = int(w * x_factor);
+                int height = int(h * y_factor);
+
+                boxes.push_back(cv::Rect(left, top, width, height));
+            }
        }
+        else // yolov5
+        {
+            // yolov5 rows are [x, y, w, h, confidence, class scores...]
+            float confidence = data[4];
+
+            if (confidence >= modelConfidenceThreshold)
+            {
+                float *classes_scores = data+5;
+
+                cv::Mat scores(1, classes.size(), CV_32FC1, classes_scores);
+                cv::Point class_id;
+                double max_class_score;
+
+                minMaxLoc(scores, 0, &max_class_score, 0, &class_id);
+
+                if (max_class_score > modelScoreThreshold)
+                {
+                    confidences.push_back(confidence);
+                    class_ids.push_back(class_id.x);
+
+                    float x = data[0];
+                    float y = data[1];
+                    float w = data[2];
+                    float h = data[3];
+
+                    int left = int((x - 0.5 * w) * x_factor);
+                    int top = int((y - 0.5 * h) * y_factor);
+
+                    int width = int(w * x_factor);
+                    int height = int(h * y_factor);
+
+                    boxes.push_back(cv::Rect(left, top, width, height));
+                }
+            }
+        }
+
+        // Advance to the next candidate row
+        data += dimensions;
+    }
+
+    std::vector<int> nms_result;
+    cv::dnn::NMSBoxes(boxes, confidences, modelScoreThreshold, modelNMSThreshold, nms_result);
+
+    // One generator for the whole call: creating and seeding a new engine for
+    // every detection is wasted work
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> dis(100, 255);
+
+    std::vector<Detection> detections{};
+    detections.reserve(nms_result.size());
+    for (unsigned long i = 0; i < nms_result.size(); ++i)
+    {
+        int idx = nms_result[i];
+
+        Detection result;
+        result.class_id = class_ids[idx];
+        result.confidence = confidences[idx];
+
+        result.color = cv::Scalar(dis(gen),
+                                  dis(gen),
+                                  dis(gen));
+
+        result.className = classes[result.class_id];
+        result.box = boxes[idx];
+
+        detections.push_back(result);
    }

    return detections;
 }

-
-/*
-    * Function to run inference
-    *
-    * @param input_tensor_values: vector of floats representing the input tensor
-    *
-    * @return: vector of floats representing the output tensor
-*/
-std::vector<float> InferenceEngine::runInference(const std::vector<float> &input_tensor_values)
+/*
+ * Append every line of the file at classesPath to the classes list.
+ * Nothing happens when the file cannot be opened.
+ */
+void Inference::loadClassesFromFile()
 {
-    Ort::AllocatorWithDefaultOptions allocator;
-
-    std::string input_name = getInputName();
-    std::string output_name = getOutputName();
-
-    const char *input_name_ptr = input_name.c_str();
-    const char *output_name_ptr = output_name.c_str();
-
-    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
-    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, const_cast<float *>(input_tensor_values.data()), input_tensor_values.size(), input_shape.data(), input_shape.size());
-
-    auto output_tensors = session.Run(Ort::RunOptions{nullptr}, &input_name_ptr, &input_tensor, 1, &output_name_ptr, 1);
-
-    float *floatarr = output_tensors[0].GetTensorMutableData<float>();
-    size_t output_tensor_size = output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount();
-
-    return std::vector<float>(floatarr, floatarr + output_tensor_size);
+    std::ifstream classFile(classesPath);
+    if (!classFile.is_open())
+        return;
+
+    // One class name per line; the stream closes itself when it goes out of scope
+    for (std::string name; std::getline(classFile, name);)
+        classes.push_back(name);
 }

-/*
-    * Function to draw the labels on the image
-    *
-    * @param image: input image
-    * @param detections: vector of Detection objects
-    *
-    * @return: image with labels drawn
+/*
+ * Read the ONNX model at modelPath into net and select the CUDA or the
+ * OpenCV/CPU backend and target depending on cudaEnabled.
+ */
+void Inference::loadOnnxNetwork()
+{
+    printf("loadOnnxNetwork() starts\n");

-*/
-cv::Mat InferenceEngine::draw_labels(const cv::Mat &image, const std::vector<Detection> &detections)
+    net = cv::dnn::readNetFromONNX(modelPath);
+
+    if (!cudaEnabled)
+    {
+        std::cout << "\nRunning on CPU" << std::endl;
+        net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
+        net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
+        return;
+    }
+
+    std::cout << "\nRunning on CUDA" << std::endl;
+    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
+    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);
+}
+
+/*
+ * Pad an image to a square canvas whose side is the larger of its two
+ * dimensions. The source is copied to the top-left corner and the rest of the
+ * canvas is zero-filled.
+ *
+ * @param source: image to pad
+ *
+ * @return: square image of the same type as the source
+ */
+cv::Mat Inference::formatToSquare(const cv::Mat &source)
+{
+    const int col = source.cols;
+    const int row = source.rows;
+    const int side = MAX(col, row);
+    // Allocate the canvas with the source's own type: copyTo() reallocates a
+    // destination whose type differs, so with a hard-coded CV_8UC3 canvas the
+    // pixels of any other image type would never reach the canvas
+    cv::Mat result = cv::Mat::zeros(side, side, source.type());
+    source.copyTo(result(cv::Rect(0, 0, col, row)));
+    return result;
+}
+
+cv::Mat Inference::drawLabels(const cv::Mat &image, const std::vector<Detection> &detections)
 {
    cv::Mat result = image.clone();

    for (const auto &detection : detections)
    {
-        cv::rectangle(result, detection.bbox, cv::Scalar(0, 255, 0), 2);
-        std::string label = detection.class_name + ": " + std::to_string(detection.confidence);
+        cv::rectangle(result, detection.box, cv::Scalar(0, 255, 0), 2);
+        std::string label = detection.className + ": " + std::to_string(detection.confidence);

        int baseLine;
        cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        cv::rectangle(
            result,
-            cv::Point(detection.bbox.x, detection.bbox.y - labelSize.height),
-            cv::Point(detection.bbox.x + labelSize.width, detection.bbox.y + baseLine),
+            cv::Point(detection.box.x, detection.box.y - labelSize.height),
+            cv::Point(detection.box.x + labelSize.width, detection.box.y + baseLine),
            cv::Scalar(255, 255, 255),
            cv::FILLED);

@@ -164,8 +211,8 @@ cv::Mat InferenceEngine::draw_labels(const cv::Mat &image, const std::vector<Det
            result,
            label,
            cv::Point(
-                detection.bbox.x,
-                detection.bbox.y),
+                detection.box.x,
+                detection.box.y),
            cv::FONT_HERSHEY_SIMPLEX,
            0.5,
            cv::Scalar(0, 0, 0),
@@ -174,27 +221,3 @@ cv::Mat InferenceEngine::draw_labels(const cv::Mat &image, const std::vector<Det

    return result;
 }
-
-/*
-    * Function to get the input name
-    *
-    * @return: name of the input tensor
-*/
-std::string InferenceEngine::getInputName()
-{
-    Ort::AllocatorWithDefaultOptions allocator;
-    Ort::AllocatedStringPtr name_allocator = session.GetInputNameAllocated(0, allocator);
-    return std::string(name_allocator.get());
-}
-
-/*
-    * Function to get the output name
-    *
-    * @return: name of the output tensor
-*/
-std::string InferenceEngine::getOutputName()
-{
-    Ort::AllocatorWithDefaultOptions allocator;
-    Ort::AllocatedStringPtr name_allocator = session.GetOutputNameAllocated(0, allocator);
-    return std::string(name_allocator.get());
-}