Initial object recognition for an image, video and RTSP stream with YOLOv10.

2026-06-21 09:11:10 +00:00 · 2024-06-19 20:33:46 +02:00
parent db2652d979
commit 0c37aa6116
7 changed files with 495 additions and 0 deletions
@@ -0,0 +1,61 @@
+cmake_minimum_required(VERSION 3.10)
+
+# Set the project name in a variable
+set(project_name yolov10_cpp)
+project(${project_name})
+
+# Require C++17 outright; without CMAKE_CXX_STANDARD_REQUIRED CMake may
+# silently fall back to an older standard if the compiler lacks support.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+find_package(OpenCV REQUIRED)
+
+# Root of the ONNX Runtime installation. The default is the location this
+# project was originally configured for; override it on other machines with
+#   cmake -DONNXRUNTIME_ROOT=/path/to/onnxruntime ..
+set(ONNXRUNTIME_ROOT "/opt/onnxruntime-linux-x64-rocm-1.18.0"
+    CACHE PATH "Root directory of the ONNX Runtime installation")
+
+# Find ONNX Runtime package
+find_path(ONNXRUNTIME_INCLUDE_DIR onnxruntime_c_api.h
+    HINTS ${ONNXRUNTIME_ROOT}/include
+)
+find_library(ONNXRUNTIME_LIBRARY onnxruntime
+    HINTS ${ONNXRUNTIME_ROOT}/lib
+)
+
+if(NOT ONNXRUNTIME_INCLUDE_DIR)
+    message(FATAL_ERROR "ONNX Runtime include directory not found")
+endif()
+if(NOT ONNXRUNTIME_LIBRARY)
+    message(FATAL_ERROR "ONNX Runtime library not found")
+endif()
+
+# Library holding the inference engine shared by all executables
+add_library(${project_name}-lib
+    src/placeholder.cpp
+    src/ia/inference.cpp
+    src/ia/inference.h
+)
+
+target_include_directories(${project_name}-lib PUBLIC src)
+target_include_directories(${project_name}-lib PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})
+
+target_link_libraries(${project_name}-lib
+    PUBLIC ${OpenCV_LIBS}
+    PUBLIC ${ONNXRUNTIME_LIBRARY}
+)
+
+# Add the main (single image) executable
+add_executable(${project_name}
+    ./src/main.cpp
+)
+target_include_directories(${project_name} PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})
+target_link_libraries(${project_name} ${project_name}-lib)
+
+# Add the video executable
+add_executable(${project_name}_video
+    ./src/video.cpp
+)
+target_include_directories(${project_name}_video PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})
+target_link_libraries(${project_name}_video ${project_name}-lib)
+
+# Add the RTSP stream executable
+add_executable(${project_name}_video_rtsp
+    ./src/video_rtsp.cpp
+)
+target_include_directories(${project_name}_video_rtsp PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})
+target_link_libraries(${project_name}_video_rtsp ${project_name}-lib)
+
@@ -0,0 +1,200 @@
+#include "inference.h"
+
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+/*
+ * Human-readable label for every class id, indexed by the class id that
+ * filterDetections reads from the model output.
+ *
+ * NOTE(review): the 80 entries look like the COCO label set; confirm that the
+ * order matches the model that is actually loaded.
+ */
+const std::vector<std::string> InferenceEngine::CLASS_NAMES = {
+    /*  0 */ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
+    /*  8 */ "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
+    /* 16 */ "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+    /* 24 */ "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
+    /* 32 */ "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
+    /* 40 */ "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+    /* 48 */ "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
+    /* 56 */ "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
+    /* 64 */ "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
+    /* 72 */ "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
+};
+
+InferenceEngine::InferenceEngine(const std::string &model_path)
+    : env(ORT_LOGGING_LEVEL_WARNING, "ONNXRuntime"),
+      session_options(),
+      session(env, model_path.c_str(), session_options),
+      input_shape{1, 3, 640, 640}
+{
+    session_options.SetIntraOpNumThreads(1);
+    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC);
+}
+
+InferenceEngine::~InferenceEngine() {}
+
+/*
+ * Function to preprocess an image into the input tensor of the model
+ *
+ * The image is converted to 3-channel RGB, resized to the spatial size stored
+ * in input_shape, scaled to the range [0, 1] and flattened channel by channel
+ * (all of the first channel, then the second, then the third).
+ *
+ * @param image: image to preprocess, with 1 (gray), 3 (BGR) or 4 (BGRA) channels
+ *
+ * @return: vector of floats representing the preprocessed image
+ *
+ * @throws std::runtime_error: if the image is empty or has an unsupported
+ *                             number of channels
+ */
+std::vector<float> InferenceEngine::preprocessImage(const cv::Mat &image)
+{
+    if (image.empty())
+    {
+        throw std::runtime_error("Could not read the image");
+    }
+
+    // The tensor is declared as {N, C, rows, cols}, so input_shape[2] is the
+    // number of rows (height) and input_shape[3] the number of columns (width).
+    const int target_height = static_cast<int>(input_shape[2]);
+    const int target_width = static_cast<int>(input_shape[3]);
+
+    // OpenCV decodes images as BGR. NOTE(review): this assumes the model expects
+    // RGB input, as the stock YOLO exports do -- confirm for custom models.
+    cv::Mat rgb_image;
+    switch (image.channels())
+    {
+    case 1:
+        cv::cvtColor(image, rgb_image, cv::COLOR_GRAY2RGB);
+        break;
+    case 3:
+        cv::cvtColor(image, rgb_image, cv::COLOR_BGR2RGB);
+        break;
+    case 4:
+        cv::cvtColor(image, rgb_image, cv::COLOR_BGRA2RGB);
+        break;
+    default:
+        throw std::runtime_error("Unsupported number of image channels");
+    }
+
+    // cv::Size takes (width, height), i.e. the reverse of the tensor order.
+    cv::Mat resized_image;
+    cv::resize(rgb_image, resized_image, cv::Size(target_width, target_height));
+
+    resized_image.convertTo(resized_image, CV_32F, 1.0 / 255);
+
+    // Split the interleaved pixels into one plane per channel.
+    std::vector<cv::Mat> channels;
+    cv::split(resized_image, channels);
+
+    const size_t plane_size = static_cast<size_t>(target_width) * static_cast<size_t>(target_height);
+
+    std::vector<float> input_tensor_values;
+    input_tensor_values.reserve(channels.size() * plane_size);
+    for (const cv::Mat &channel : channels)
+    {
+        const float *plane = channel.ptr<float>();
+        input_tensor_values.insert(input_tensor_values.end(), plane, plane + plane_size);
+    }
+
+    return input_tensor_values;
+}
+
+/*
+ * Function to filter the detections based on the confidence threshold
+ *
+ * results is read as consecutive groups of six floats:
+ * left, top, right, bottom, confidence, class id. Trailing values that do not
+ * form a complete group are ignored. Boxes are rescaled from the model input
+ * size to the original image size.
+ *
+ * Detections whose class id is not a valid index into CLASS_NAMES are dropped
+ * instead of being looked up out of bounds.
+ *
+ * @param results: vector of floats representing the output tensor
+ * @param confidence_threshold: minimum confidence threshold
+ * @param img_width: width of the model input image
+ * @param img_height: height of the model input image
+ * @param orig_width: original width of the image
+ * @param orig_height: original height of the image
+ *
+ * @return: vector of Detection objects; empty if img_width or img_height is
+ *          not positive
+ */
+std::vector<Detection> InferenceEngine::filterDetections(const std::vector<float> &results, float confidence_threshold, int img_width, int img_height, int orig_width, int orig_height)
+{
+    constexpr size_t values_per_detection = 6;
+
+    std::vector<Detection> detections;
+
+    // The rescaling below divides by the model input size.
+    if (img_width <= 0 || img_height <= 0)
+    {
+        return detections;
+    }
+
+    const size_t num_detections = results.size() / values_per_detection;
+    const float num_classes = static_cast<float>(CLASS_NAMES.size());
+
+    for (size_t i = 0; i < num_detections; ++i)
+    {
+        const float *row = results.data() + i * values_per_detection;
+
+        const float left = row[0];
+        const float top = row[1];
+        const float right = row[2];
+        const float bottom = row[3];
+        const float confidence = row[4];
+        const float raw_class_id = row[5];
+
+        if (!(confidence >= confidence_threshold))
+        {
+            continue;
+        }
+
+        // Written so that NaN fails the test as well as out-of-range values.
+        if (!(raw_class_id >= 0.0f && raw_class_id < num_classes))
+        {
+            continue;
+        }
+        const int class_id = static_cast<int>(raw_class_id);
+
+        const int x = static_cast<int>(left * orig_width / img_width);
+        const int y = static_cast<int>(top * orig_height / img_height);
+        const int width = static_cast<int>((right - left) * orig_width / img_width);
+        const int height = static_cast<int>((bottom - top) * orig_height / img_height);
+
+        detections.push_back(
+            Detection{confidence,
+                      cv::Rect(x, y, width, height),
+                      class_id,
+                      CLASS_NAMES[static_cast<size_t>(class_id)]});
+    }
+
+    return detections;
+}
+
+
+
+/*
+ * Function to run inference
+ *
+ * Wraps the given values in a tensor of shape input_shape, runs the session on
+ * its first input and returns a copy of its first output.
+ *
+ * @param input_tensor_values: vector of floats representing the input tensor
+ *
+ * @return: vector of floats representing the output tensor
+ *
+ * @throws std::runtime_error: if the session returns no output tensor
+ */
+std::vector<float> InferenceEngine::runInference(const std::vector<float> &input_tensor_values)
+{
+    // Keep the names alive in std::string objects for as long as the raw
+    // pointers are in use.
+    const std::string input_name = getInputName();
+    const std::string output_name = getOutputName();
+
+    const char *input_name_ptr = input_name.c_str();
+    const char *output_name_ptr = output_name.c_str();
+
+    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+
+    // CreateTensor takes a non-const pointer, hence the const_cast on the
+    // caller's buffer.
+    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
+        memory_info,
+        const_cast<float *>(input_tensor_values.data()),
+        input_tensor_values.size(),
+        input_shape.data(),
+        input_shape.size());
+
+    auto output_tensors = session.Run(Ort::RunOptions{nullptr}, &input_name_ptr, &input_tensor, 1, &output_name_ptr, 1);
+    if (output_tensors.empty())
+    {
+        throw std::runtime_error("Inference produced no output tensor");
+    }
+
+    // The output is only read here, so the const accessor is sufficient.
+    const float *output_data = output_tensors[0].GetTensorData<float>();
+    const size_t output_tensor_size = output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount();
+
+    return std::vector<float>(output_data, output_data + output_tensor_size);
+}
+
+/*
+ * Function to draw the labels on the image
+ *
+ * Works on a copy of the image. For every detection a green bounding box is
+ * drawn together with a "<class name>: <confidence>" label in black text on a
+ * white background. The label is anchored at the top-left corner of the box
+ * and shifted down when the box touches the top of the image, so that it is
+ * never drawn above the visible area.
+ *
+ * @param image: input image
+ * @param detections: vector of Detection objects
+ *
+ * @return: image with labels drawn
+ */
+cv::Mat InferenceEngine::draw_labels(const cv::Mat &image, const std::vector<Detection> &detections)
+{
+    cv::Mat result = image.clone();
+
+    for (const auto &detection : detections)
+    {
+        cv::rectangle(result, detection.bbox, cv::Scalar(0, 255, 0), 2);
+        const std::string label = detection.class_name + ": " + std::to_string(detection.confidence);
+
+        int baseLine = 0;
+        const cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // Text baseline; at least one text height below the top of the image.
+        const int label_left = detection.bbox.x;
+        const int label_baseline = std::max(detection.bbox.y, labelSize.height);
+
+        cv::rectangle(
+            result,
+            cv::Point(label_left, label_baseline - labelSize.height),
+            cv::Point(label_left + labelSize.width, label_baseline + baseLine),
+            cv::Scalar(255, 255, 255),
+            cv::FILLED);
+
+        cv::putText(
+            result,
+            label,
+            cv::Point(label_left, label_baseline),
+            cv::FONT_HERSHEY_SIMPLEX,
+            0.5,
+            cv::Scalar(0, 0, 0),
+            1);
+    }
+
+    return result;
+}
+
+/*
+ * Function to look up the name of the first input (index 0) of the session
+ *
+ * @return: name of the input tensor, copied into a std::string
+ */
+std::string InferenceEngine::getInputName()
+{
+    Ort::AllocatorWithDefaultOptions default_allocator;
+    const auto first_input_name = session.GetInputNameAllocated(0, default_allocator);
+
+    // Copy the characters before the allocated name goes out of scope.
+    std::string name_copy(first_input_name.get());
+    return name_copy;
+}
+
+/*
+ * Function to look up the name of the first output (index 0) of the session
+ *
+ * @return: name of the output tensor, copied into a std::string
+ */
+std::string InferenceEngine::getOutputName()
+{
+    Ort::AllocatorWithDefaultOptions default_allocator;
+    const auto first_output_name = session.GetOutputNameAllocated(0, default_allocator);
+
+    // Copy the characters before the allocated name goes out of scope.
+    std::string name_copy(first_output_name.get());
+    return name_copy;
+}
@@ -0,0 +1,44 @@
+#ifndef INFERENCE_H
+#define INFERENCE_H
+
+#include <onnxruntime_cxx_api.h>
+#include <opencv2/opencv.hpp>
+#include <vector>
+#include <string>
+
+/*
+ * One detected object.
+ *
+ * The members have default values so that a default-constructed Detection
+ * never carries indeterminate data. The member order is relied upon by
+ * brace-initialization ({confidence, bbox, class_id, class_name}).
+ */
+struct Detection
+{
+    float confidence = 0.0f; // score compared against the confidence threshold
+    cv::Rect bbox;           // bounding box in pixels of the original image
+    int class_id = -1;       // index into the class name table, -1 if unset
+    std::string class_name;  // label looked up for class_id
+};
+
+
+/*
+ * Runs an ONNX object-detection model on OpenCV images.
+ *
+ * Typical use: preprocessImage -> runInference -> filterDetections ->
+ * draw_labels.
+ */
+class InferenceEngine
+{
+public:
+    /*
+     * Create an engine for the ONNX model at model_path.
+     * explicit: a string must not silently convert into an engine.
+     */
+    explicit InferenceEngine(const std::string &model_path);
+    ~InferenceEngine();
+
+    /*
+     * Convert an image into the flat float tensor the model is fed with.
+     * Throws std::runtime_error if the image is empty.
+     */
+    std::vector<float> preprocessImage(const cv::Mat &image);
+
+    /*
+     * Turn the raw model output into detections. Keeps the entries whose
+     * confidence is at least confidence_threshold and rescales their boxes
+     * from the model input size (img_width x img_height) to the original
+     * image size (orig_width x orig_height).
+     */
+    std::vector<Detection> filterDetections(const std::vector<float> &results, float confidence_threshold, int img_width, int img_height, int orig_width, int orig_height);
+
+    /*
+     * Run the model on a preprocessed tensor and return a copy of its first
+     * output.
+     */
+    std::vector<float> runInference(const std::vector<float> &input_tensor_values);
+
+    /*
+     * Return a copy of image with a box and a label drawn for each detection.
+     */
+    cv::Mat draw_labels(const cv::Mat &image, const std::vector<Detection> &detections);
+
+    // Shape handed to the model as its input tensor; the constructor sets it
+    // to {1, 3, 640, 640}.
+    std::vector<int64_t> input_shape;
+
+private:
+    // Declaration order matters: session is constructed from env and
+    // session_options, so both must be declared (and thus built) before it.
+    Ort::Env env;
+    Ort::SessionOptions session_options;
+    Ort::Session session;
+
+    // Names of the first input / first output of the session.
+    std::string getInputName();
+    std::string getOutputName();
+
+    // Label for each class id, indexed by class id.
+    static const std::vector<std::string> CLASS_NAMES;
+};
+
+
+#endif // INFERENCE_H
@@ -0,0 +1,44 @@
+#include "./ia/inference.h"
+#include <iostream>
+#include <opencv2/opencv.hpp>
+
+/*
+ * Detect objects in a single image and write the annotated result to
+ * "result.jpg" in the current directory.
+ *
+ * Usage: <program> <model_path> <image_path>
+ *
+ * @return: 0 on success, 1 on wrong usage or on any error
+ */
+int main(int argc, char *argv[])
+{
+    if (argc != 3)
+    {
+        std::cerr << "Usage: " << argv[0] << " <model_path> <image_path>" << std::endl;
+        return 1;
+    }
+
+    const std::string model_path = argv[1];
+    const std::string image_path = argv[2];
+
+    try
+    {
+        InferenceEngine engine(model_path);
+
+        const cv::Mat image = cv::imread(image_path);
+        if (image.empty())
+        {
+            // Report the offending path rather than a generic message.
+            std::cerr << "Error: could not read image '" << image_path << "'" << std::endl;
+            return 1;
+        }
+
+        const std::vector<float> input_tensor_values = engine.preprocessImage(image);
+        const std::vector<float> results = engine.runInference(input_tensor_values);
+
+        const float confidence_threshold = 0.5f;
+
+        // The tensor shape is {N, C, rows, cols}: index 2 is the height and
+        // index 3 the width of the model input.
+        const int input_height = static_cast<int>(engine.input_shape[2]);
+        const int input_width = static_cast<int>(engine.input_shape[3]);
+
+        const std::vector<Detection> detections = engine.filterDetections(
+            results, confidence_threshold, input_width, input_height, image.cols, image.rows);
+
+        const cv::Mat output = engine.draw_labels(image, detections);
+
+        if (!cv::imwrite("result.jpg", output))
+        {
+            std::cerr << "Error: could not write result.jpg" << std::endl;
+            return 1;
+        }
+    }
+    catch (const std::exception &e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
@@ -0,0 +1,2 @@
+// src/placeholder.cpp
+
+/*
+ * Function that intentionally does nothing.
+ *
+ * NOTE(review): presumably only here so that the library always has a
+ * translation unit of its own -- confirm before removing.
+ */
+void placeholder_function()
+{
+}
@@ -0,0 +1,71 @@
+#include "./ia/inference.h"
+#include <iostream>
+#include <opencv2/opencv.hpp>
+
+/*
+ * Detect objects in a video and show the annotated frames in a window.
+ *
+ * Usage: <program> <model_path> <source>
+ *
+ * <source> is either a capture device index made of digits only (e.g. "0" for
+ * the default webcam) or anything else OpenCV can open by name, such as the
+ * path of a video file.
+ *
+ * The loop ends when a key is pressed or when no further frame can be read
+ * (for a video file: the end of the file).
+ *
+ * @return: 0 on normal termination, 1 on wrong usage or on an exception,
+ *          -1 if the source cannot be opened
+ */
+int main(int argc, char const *argv[])
+{
+    if (argc != 3)
+    {
+        std::cerr << "Usage: " << argv[0] << " <model_path> <source>" << std::endl;
+        return 1;
+    }
+    const std::string model_path = argv[1];
+    const std::string source = argv[2];
+
+    // One name for both namedWindow and imshow; two different names would
+    // open a second, empty window.
+    const std::string window_name = "yolov10";
+    const float confidence_threshold = 0.3f;
+
+    try
+    {
+        InferenceEngine engine(model_path);
+
+        // A source consisting only of digits selects a capture device;
+        // passing it on as a string would make OpenCV look for a file.
+        bool is_device_index = !source.empty();
+        for (const char c : source)
+        {
+            if (c < '0' || c > '9')
+            {
+                is_device_index = false;
+                break;
+            }
+        }
+
+        cv::VideoCapture cap;
+        if (is_device_index)
+        {
+            cap.open(std::stoi(source), cv::CAP_ANY);
+        }
+        else
+        {
+            cap.open(source, cv::CAP_ANY);
+        }
+
+        if (!cap.isOpened())
+        {
+            std::cerr << "ERROR! Unable to open video\n";
+            return -1;
+        }
+
+        cv::namedWindow(window_name, cv::WINDOW_AUTOSIZE);
+
+        // The tensor shape is {N, C, rows, cols}: index 2 is the height and
+        // index 3 the width of the model input.
+        const int input_height = static_cast<int>(engine.input_shape[2]);
+        const int input_width = static_cast<int>(engine.input_shape[3]);
+
+        std::cout << "Start grabbing" << std::endl
+                  << "Press any key to terminate" << std::endl;
+
+        cv::Mat frame;
+        for (;;)
+        {
+            // Stop instead of retrying: once a video file is exhausted every
+            // further read fails and the loop would never end.
+            if (!cap.read(frame) || frame.empty())
+            {
+                std::cerr << "ERROR! blank frame grabbed\n";
+                break;
+            }
+
+            const auto timer = cv::getTickCount();
+
+            const std::vector<float> input_tensor_values = engine.preprocessImage(frame);
+            const std::vector<float> results = engine.runInference(input_tensor_values);
+            const std::vector<Detection> detections = engine.filterDetections(
+                results, confidence_threshold, input_width, input_height, frame.cols, frame.rows);
+
+            // Frames per second of preprocessing + inference + filtering.
+            const double fps = cv::getTickFrequency() / static_cast<double>(cv::getTickCount() - timer);
+
+            cv::putText(frame, "FPS: " + std::to_string(fps), cv::Point(10, 30), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0), 2, 8);
+
+            const cv::Mat output = engine.draw_labels(frame, detections);
+
+            cv::imshow(window_name, output);
+
+            if (cv::waitKey(5) >= 0)
+                break;
+        }
+    }
+    catch (const std::exception &e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
@@ -0,0 +1,73 @@
+#include "./ia/inference.h"
+#include <iostream>
+#include <opencv2/opencv.hpp>
+
+/*
+ * Detect objects in an RTSP stream and show the annotated frames in a window.
+ *
+ * Usage: <program> <model_path> [rtsp_url]
+ *
+ * If no URL is given, "rtsp://localhost:8554/live.stream" is used.
+ *
+ * The loop ends when a key is pressed or when too many consecutive frames
+ * could not be read, which is taken to mean that the stream is gone.
+ *
+ * @return: 0 on normal termination, 1 on wrong usage or on an exception,
+ *          -1 if the stream cannot be opened
+ */
+int main(int argc, char const *argv[])
+{
+    if (argc != 2 && argc != 3)
+    {
+        std::cerr << "Usage: " << argv[0] << " <model_path> [rtsp_url]" << std::endl;
+        return 1;
+    }
+
+    const std::string model_path = argv[1];
+    const std::string rtsp_url = (argc == 3) ? argv[2] : "rtsp://localhost:8554/live.stream";
+
+    // One name for both namedWindow and imshow; two different names would
+    // open a second, empty window.
+    const std::string window_name = "yolov10";
+    const float confidence_threshold = 0.3f;
+
+    // A live stream may drop single frames, so a few failed reads are
+    // tolerated; a long run of them ends the program instead of spinning.
+    const int max_consecutive_blank_frames = 100;
+
+    try
+    {
+        InferenceEngine engine(model_path);
+
+        cv::VideoCapture cap(rtsp_url);
+
+        if (!cap.isOpened())
+        {
+            std::cerr << "ERROR! Unable to open video\n";
+            return -1;
+        }
+
+        cv::namedWindow(window_name, cv::WINDOW_AUTOSIZE);
+
+        // The tensor shape is {N, C, rows, cols}: index 2 is the height and
+        // index 3 the width of the model input.
+        const int input_height = static_cast<int>(engine.input_shape[2]);
+        const int input_width = static_cast<int>(engine.input_shape[3]);
+
+        std::cout << "Start grabbing" << std::endl
+                  << "Press any key to terminate" << std::endl;
+
+        cv::Mat frame;
+        int consecutive_blank_frames = 0;
+
+        for (;;)
+        {
+            if (!cap.read(frame) || frame.empty())
+            {
+                std::cerr << "ERROR! blank frame grabbed\n";
+                if (++consecutive_blank_frames >= max_consecutive_blank_frames)
+                {
+                    std::cerr << "ERROR! stream seems to be gone, giving up\n";
+                    break;
+                }
+                continue;
+            }
+            consecutive_blank_frames = 0;
+
+            const auto timer = cv::getTickCount();
+
+            const std::vector<float> input_tensor_values = engine.preprocessImage(frame);
+            const std::vector<float> results = engine.runInference(input_tensor_values);
+            const std::vector<Detection> detections = engine.filterDetections(
+                results, confidence_threshold, input_width, input_height, frame.cols, frame.rows);
+
+            // Frames per second of preprocessing + inference + filtering.
+            const double fps = cv::getTickFrequency() / static_cast<double>(cv::getTickCount() - timer);
+
+            cv::putText(frame, "FPS: " + std::to_string(fps), cv::Point(10, 30), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0), 2, 8);
+
+            const cv::Mat output = engine.draw_labels(frame, detections);
+
+            cv::imshow(window_name, output);
+
+            if (cv::waitKey(5) >= 0)
+                break;
+        }
+    }
+    catch (const std::exception &e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}