# ---- CMakeLists.txt ----
cmake_minimum_required(VERSION 3.10)

# Project name kept in a variable so all target names below stay consistent.
set(project_name yolov10_cpp)
project(${project_name})

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)  # fail loudly instead of silently falling back

find_package(OpenCV REQUIRED)

# Locate ONNX Runtime manually (this distribution ships no CMake config).
find_path(ONNXRUNTIME_INCLUDE_DIR onnxruntime_c_api.h
    HINTS /opt/onnxruntime-linux-x64-rocm-1.18.0/include
)
find_library(ONNXRUNTIME_LIBRARY onnxruntime
    HINTS /opt/onnxruntime-linux-x64-rocm-1.18.0/lib
)

if(NOT ONNXRUNTIME_INCLUDE_DIR)
    message(FATAL_ERROR "ONNX Runtime include directory not found")
endif()
if(NOT ONNXRUNTIME_LIBRARY)
    message(FATAL_ERROR "ONNX Runtime library not found")
endif()

# Core inference library shared by every executable.
add_library(${project_name}-lib
    src/placeholder.cpp
    src/ia/inference.cpp
    src/ia/inference.h
)

# PUBLIC so the include dirs and libs propagate to all linking executables,
# which makes the per-executable include_directories of the original redundant.
target_include_directories(${project_name}-lib PUBLIC src)
target_include_directories(${project_name}-lib PUBLIC ${ONNXRUNTIME_INCLUDE_DIR})

target_link_libraries(${project_name}-lib
    PUBLIC ${OpenCV_LIBS}
    PUBLIC ${ONNXRUNTIME_LIBRARY}
)

# Helper: declare an executable linked against the core library.
function(add_yolo_executable target source)
    add_executable(${target} ${source})
    target_link_libraries(${target} ${project_name}-lib)
endfunction()

add_yolo_executable(${project_name}            src/main.cpp)
add_yolo_executable(${project_name}_video      src/video.cpp)
add_yolo_executable(${project_name}_video_rtsp src/video_rtsp.cpp)
b/tmp/yolov10-cpp-testing/src/ia/inference.cpp new file mode 100644 index 0000000..38cd77c --- /dev/null +++ b/tmp/yolov10-cpp-testing/src/ia/inference.cpp @@ -0,0 +1,200 @@ +#include "inference.h" +#include +#include + +const std::vector InferenceEngine::CLASS_NAMES = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", + "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", + "scissors", "teddy bear", "hair drier", "toothbrush"}; + +InferenceEngine::InferenceEngine(const std::string &model_path) + : env(ORT_LOGGING_LEVEL_WARNING, "ONNXRuntime"), + session_options(), + session(env, model_path.c_str(), session_options), + input_shape{1, 3, 640, 640} +{ + session_options.SetIntraOpNumThreads(1); + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC); +} + +InferenceEngine::~InferenceEngine() {} + +/* + * Function to preprocess the image + * + * @param image_path: path to the image + * @param orig_width: original width of the image + * @param orig_height: original height of the image + * + * @return: vector of floats representing the preprocessed image + */ +std::vector InferenceEngine::preprocessImage(const cv::Mat &image) +{ + if (image.empty()) + { + throw std::runtime_error("Could not read the image"); + } + + cv::Mat 
resized_image; + cv::resize(image, resized_image, cv::Size(input_shape[2], input_shape[3])); + + resized_image.convertTo(resized_image, CV_32F, 1.0 / 255); + + std::vector channels(3); + cv::split(resized_image, channels); + + std::vector input_tensor_values; + for (int c = 0; c < 3; ++c) + { + input_tensor_values.insert(input_tensor_values.end(), (float *)channels[c].data, (float *)channels[c].data + input_shape[2] * input_shape[3]); + } + + return input_tensor_values; +} + +/* + * Function to filter the detections based on the confidence threshold + * + * @param results: vector of floats representing the output tensor + * @param confidence_threshold: minimum confidence threshold + * @param img_width: width of the input image + * @param img_height: height of the input image + * @param orig_width: original width of the image + * @param orig_height: original height of the image + * + * @return: vector of Detection objects + +*/ +std::vector InferenceEngine::filterDetections(const std::vector &results, float confidence_threshold, int img_width, int img_height, int orig_width, int orig_height) +{ + std::vector detections; + const int num_detections = results.size() / 6; + + for (int i = 0; i < num_detections; ++i) + { + float left = results[i * 6 + 0]; + float top = results[i * 6 + 1]; + float right = results[i * 6 + 2]; + float bottom = results[i * 6 + 3]; + float confidence = results[i * 6 + 4]; + int class_id = results[i * 6 + 5]; + + if (confidence >= confidence_threshold) + { + int x = static_cast(left * orig_width / img_width); + int y = static_cast(top * orig_height / img_height); + int width = static_cast((right - left) * orig_width / img_width); + int height = static_cast((bottom - top) * orig_height / img_height); + + detections.push_back( + {confidence, + cv::Rect(x, y, width, height), + class_id, + CLASS_NAMES[class_id]}); + } + } + + return detections; +} + + +/* + * Function to run inference + * + * @param input_tensor_values: vector of floats 
representing the input tensor + * + * @return: vector of floats representing the output tensor +*/ +std::vector InferenceEngine::runInference(const std::vector &input_tensor_values) +{ + Ort::AllocatorWithDefaultOptions allocator; + + std::string input_name = getInputName(); + std::string output_name = getOutputName(); + + const char *input_name_ptr = input_name.c_str(); + const char *output_name_ptr = output_name.c_str(); + + Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::Value input_tensor = Ort::Value::CreateTensor(memory_info, const_cast(input_tensor_values.data()), input_tensor_values.size(), input_shape.data(), input_shape.size()); + + auto output_tensors = session.Run(Ort::RunOptions{nullptr}, &input_name_ptr, &input_tensor, 1, &output_name_ptr, 1); + + float *floatarr = output_tensors[0].GetTensorMutableData(); + size_t output_tensor_size = output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount(); + + return std::vector(floatarr, floatarr + output_tensor_size); +} + +/* + * Function to draw the labels on the image + * + * @param image: input image + * @param detections: vector of Detection objects + * + * @return: image with labels drawn + +*/ +cv::Mat InferenceEngine::draw_labels(const cv::Mat &image, const std::vector &detections) +{ + cv::Mat result = image.clone(); + + for (const auto &detection : detections) + { + cv::rectangle(result, detection.bbox, cv::Scalar(0, 255, 0), 2); + std::string label = detection.class_name + ": " + std::to_string(detection.confidence); + + int baseLine; + cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + cv::rectangle( + result, + cv::Point(detection.bbox.x, detection.bbox.y - labelSize.height), + cv::Point(detection.bbox.x + labelSize.width, detection.bbox.y + baseLine), + cv::Scalar(255, 255, 255), + cv::FILLED); + + cv::putText( + result, + label, + cv::Point( + detection.bbox.x, + detection.bbox.y), + 
cv::FONT_HERSHEY_SIMPLEX, + 0.5, + cv::Scalar(0, 0, 0), + 1); + } + + return result; +} + +/* + * Function to get the input name + * + * @return: name of the input tensor +*/ +std::string InferenceEngine::getInputName() +{ + Ort::AllocatorWithDefaultOptions allocator; + Ort::AllocatedStringPtr name_allocator = session.GetInputNameAllocated(0, allocator); + return std::string(name_allocator.get()); +} + +/* + * Function to get the output name + * + * @return: name of the output tensor +*/ +std::string InferenceEngine::getOutputName() +{ + Ort::AllocatorWithDefaultOptions allocator; + Ort::AllocatedStringPtr name_allocator = session.GetOutputNameAllocated(0, allocator); + return std::string(name_allocator.get()); +} diff --git a/tmp/yolov10-cpp-testing/src/ia/inference.h b/tmp/yolov10-cpp-testing/src/ia/inference.h new file mode 100644 index 0000000..9828629 --- /dev/null +++ b/tmp/yolov10-cpp-testing/src/ia/inference.h @@ -0,0 +1,44 @@ +#ifndef INFERENCE_H +#define INFERENCE_H + +#include +#include +#include +#include + +struct Detection +{ + float confidence; + cv::Rect bbox; + int class_id; + std::string class_name; +}; + + +class InferenceEngine +{ +public: + InferenceEngine(const std::string &model_path); + ~InferenceEngine(); + + std::vector preprocessImage(const cv::Mat &image); + std::vector filterDetections(const std::vector &results, float confidence_threshold, int img_width, int img_height, int orig_width, int orig_height); + std::vector runInference(const std::vector &input_tensor_values); + + cv::Mat draw_labels(const cv::Mat &image, const std::vector &detections); + + std::vector input_shape; + +private: + Ort::Env env; + Ort::SessionOptions session_options; + Ort::Session session; + + std::string getInputName(); + std::string getOutputName(); + + static const std::vector CLASS_NAMES; +}; + + +#endif // INFERENCE_H diff --git a/tmp/yolov10-cpp-testing/src/main.cpp b/tmp/yolov10-cpp-testing/src/main.cpp new file mode 100644 index 0000000..6a52983 --- 
/dev/null +++ b/tmp/yolov10-cpp-testing/src/main.cpp @@ -0,0 +1,44 @@ +#include "./ia/inference.h" +#include +#include + +int main(int argc, char *argv[]) +{ + if (argc != 3) + { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string model_path = argv[1]; + std::string image_path = argv[2]; + + try + { + InferenceEngine engine(model_path); + + + cv::Mat image = cv::imread(image_path); + int orig_width = image.cols; + int orig_height = image.rows; + std::vector input_tensor_values = engine.preprocessImage(image ); + + std::vector results = engine.runInference(input_tensor_values); + + float confidence_threshold = 0.5; + + + std::vector detections = engine.filterDetections(results, confidence_threshold, engine.input_shape[2], engine.input_shape[3], orig_width, orig_height); + + cv::Mat output = engine.draw_labels(image, detections); + + cv::imwrite("result.jpg", output); + } + catch (const std::exception &e) + { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + + return 0; +} diff --git a/tmp/yolov10-cpp-testing/src/placeholder.cpp b/tmp/yolov10-cpp-testing/src/placeholder.cpp new file mode 100644 index 0000000..fe5f0b8 --- /dev/null +++ b/tmp/yolov10-cpp-testing/src/placeholder.cpp @@ -0,0 +1,2 @@ +// src/placeholder.cpp +void placeholder_function() {} diff --git a/tmp/yolov10-cpp-testing/src/video.cpp b/tmp/yolov10-cpp-testing/src/video.cpp new file mode 100644 index 0000000..e4791f6 --- /dev/null +++ b/tmp/yolov10-cpp-testing/src/video.cpp @@ -0,0 +1,71 @@ +#include "./ia/inference.h" +#include +#include + +int main(int argc, char const *argv[]) +{ + if (argc != 3) + { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + std::string model_path = argv[1]; + + auto source = argv[2]; // 0 for webcam, 1 for video file + int apiID = cv::CAP_ANY; // 0 = autodetect default API + + cv::namedWindow("yolov10", cv::WINDOW_AUTOSIZE); + + InferenceEngine engine(model_path); + + cv::VideoCapture 
cap(source); + + //cap.open(source, apiID); + + if (!cap.isOpened()) + { + std::cerr << "ERROR! Unable to open video\n"; + return -1; + } + + cv::Mat frame; + + std::cout << "Start grabbing" << std::endl + << "Press any key to terminate" << std::endl; + + for (;;) + { + cap.read(frame); + + if (frame.empty()) + { + std::cerr << "ERROR! blank frame grabbed\n"; + continue; + } + + int orig_width = frame.cols; + int orig_height = frame.rows; + auto timer = cv::getTickCount(); + + std::vector input_tensor_values = engine.preprocessImage(frame); + + std::vector results = engine.runInference(input_tensor_values); + + float confidence_threshold = 0.3; + + std::vector detections = engine.filterDetections(results, confidence_threshold, engine.input_shape[2], engine.input_shape[3], orig_width, orig_height); + + double fps = cv::getTickFrequency() / ((double)cv::getTickCount() - timer); + + cv::putText(frame, "FPS: " + std::to_string(fps), cv::Point(10, 30), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0), 2, 8); + + cv::Mat output = engine.draw_labels(frame, detections); + + cv::imshow("test", output); + + if (cv::waitKey(5) >= 0) + break; + } + + return 0; +} diff --git a/tmp/yolov10-cpp-testing/src/video_rtsp.cpp b/tmp/yolov10-cpp-testing/src/video_rtsp.cpp new file mode 100644 index 0000000..a159bc1 --- /dev/null +++ b/tmp/yolov10-cpp-testing/src/video_rtsp.cpp @@ -0,0 +1,73 @@ +#include "./ia/inference.h" +#include +#include + +int main(int argc, char const *argv[]) +{ + if (argc != 2) + { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string model_path = argv[1]; + std::string rtsp_url = "rtsp://localhost:8554/live.stream"; + + auto source = argv[2]; // 0 for webcam, 1 for video file + int apiID = cv::CAP_ANY; // 0 = autodetect default API + + cv::namedWindow("yolov10", cv::WINDOW_AUTOSIZE); + + InferenceEngine engine(model_path); + + cv::VideoCapture cap(rtsp_url); + + //cap.open(source, apiID); + + if (!cap.isOpened()) + { + 
std::cerr << "ERROR! Unable to open video\n"; + return -1; + } + + cv::Mat frame; + + std::cout << "Start grabbing" << std::endl + << "Press any key to terminate" << std::endl; + + for (;;) + { + cap.read(frame); + + if (frame.empty()) + { + std::cerr << "ERROR! blank frame grabbed\n"; + continue; + } + + int orig_width = frame.cols; + int orig_height = frame.rows; + auto timer = cv::getTickCount(); + + std::vector input_tensor_values = engine.preprocessImage(frame); + + std::vector results = engine.runInference(input_tensor_values); + + float confidence_threshold = 0.3; + + std::vector detections = engine.filterDetections(results, confidence_threshold, engine.input_shape[2], engine.input_shape[3], orig_width, orig_height); + + double fps = cv::getTickFrequency() / ((double)cv::getTickCount() - timer); + + cv::putText(frame, "FPS: " + std::to_string(fps), cv::Point(10, 30), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0), 2, 8); + + cv::Mat output = engine.draw_labels(frame, detections); + + cv::imshow("test", output); + + if (cv::waitKey(5) >= 0) + break; + } + + return 0; +}