Files
autopilot/misc/opencv-dnn-onnx-inference/main.cpp
T
2024-08-25 18:26:19 +03:00

263 lines
9.2 KiB
C++

#include <QDebug>
#include <QElapsedTimer>
#include <opencv2/imgproc.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#define CONFIDENCE_THRESHOLD 0.2
using namespace std;
using namespace cv;
// function to create vector of class names
// Build the Pascal VOC class-name list (index 0 is "background", followed by
// the 20 VOC object classes, in the canonical order).
// NOTE(review): the function name carries a historical typo ("Clase"); it is
// kept unchanged because existing callers use it.
// Returns: a 21-element vector of class names.
std::vector<std::string> createClaseNames() {
    // The original list was truncated after "sheep"; the full VOC list also
    // contains "sofa", "train" and "tvmonitor".
    return {
        "background", "aeroplane", "bicycle", "bird",       "boat",
        "bottle",     "bus",       "car",     "cat",        "chair",
        "cow",        "diningtable", "dog",   "horse",      "motorbike",
        "person",     "pottedplant", "sheep", "sofa",       "train",
        "tvmonitor",
    };
}
int main(int argc, char *argv[])
{
if (argc < 3) {
qDebug() << "Give ONNX model as first argument and image as second one.";
return 1;
}
std::vector<String> classNames = createClaseNames();
std::string model_filename = argv[1];
std::string image_filename = argv[2];
// Load the ONNX model
cv::dnn::Net net = cv::dnn::readNetFromONNX(model_filename);
// Set backend to OpenCL
net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
//net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
// Load an image and preprocess it
cv::Mat image = cv::imread(image_filename);
cv::Mat blob = cv::dnn::blobFromImage(image, 1/255.0, cv::Size(640, 640), cv::Scalar(), true, false);
cv::imwrite("tmpInput.png", image);
// Set the input for the network
net.setInput(blob);
std::vector<cv::Mat> outputs;
net.forward(outputs);
cv::Mat output = outputs[0];
std::vector<cv::String> layerNames = net.getLayerNames();
std::vector<int> outLayerIndices = net.getUnconnectedOutLayers();
for (int index : outLayerIndices) {
std::cout << layerNames[index - 1] << std::endl;
}
cv::Mat detections = net.forward("output0");
// print some information about detections
std::cout << "dims: " << outputs[0].dims << std::endl;
std::cout << "size: " << outputs[0].size << std::endl;
int num_detections0 = outputs.size();
int num_classes0 = outputs[0].size[1];
int rows = outputs[0].size[1];
qDebug() << "num_detections0:" << num_detections0 << "num_classes0:" << num_classes0 << "rows:" << rows;
int num_detections = detections.size[2]; // 8400
int num_attributes = detections.size[1]; // 14
qDebug() << "num_detections:" << num_detections << "num_attributes:" << num_attributes;
detections = detections.reshape(1, num_detections);
for (int i = 0; i < num_detections; i++) {
Mat row = detections.row(i);
float box_confidence = row.at<float>(4);
float cls_confidence = row.at<float>(5);
if (box_confidence > 0.5 && cls_confidence > 0.5) {
std::cout << "box_confidence " << box_confidence << " cls_confidence: " << cls_confidence << std::endl;
}
}
/*
// Output tensor processing
// Assuming output tensor has shape [1, 14, 8400]
int num_detections = detections.size[2]; // 8400
int num_attributes = detections.size[1]; // 14
qDebug() << "num_detections:" << num_detections << "num_attributes:" << num_attributes;
// Extract and print confidence for each detection
const float* data = (float*)output.data;
for (int i = 0; i < num_detections; i++) {
// Confidence value is at index 4 (based on YOLO-like model output format)
float confidence = data[i * num_attributes + 3];
// Print confidence value
if (confidence > 0.5f) {
std::cout << "Detection " << i << " confidence: " << confidence << std::endl;
}
}
*/
/*
// Assuming outputs is a vector of cv::Mat containing the model's output
for (int i = 0; i < outputs[0].size[0]; ++i) {
for (int j = 0; j < outputs[0].size[1]; ++j) {
float confidence = outputs[0].at<float>(i, j);
// Print or use the confidence value
std::cout << "Confidence: " << confidence << std::endl;
}
}
*/
/*
// Output processing variables
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
// Analyze each output
for (const auto& output : outputs) {
// Each row is a detection: [center_x, center_y, width, height, conf, class1_score, class2_score, ...]
const auto* data = (float*)output.data;
for (int i = 0; i < output.rows; i++, data += output.cols) {
float confidence = data[4]; // Objectness confidence
if (confidence >= 0.5) { // Apply a confidence threshold
cv::Mat scores = output.row(i).colRange(5, output.cols);
cv::Point class_id_point;
double max_class_score;
cv::minMaxLoc(scores, 0, &max_class_score, 0, &class_id_point);
int class_id = class_id_point.x;
if (max_class_score >= 0.5) { // Apply a class score threshold
// Get bounding box
int center_x = (int)(data[0] * image.cols);
int center_y = (int)(data[1] * image.rows);
int width = (int)(data[2] * image.cols);
int height = (int)(data[3] * image.rows);
int left = center_x - width / 2;
int top = center_y - height / 2;
// Store the results
class_ids.push_back(class_id);
confidences.push_back(confidence);
boxes.push_back(cv::Rect(left, top, width, height));
}
}
}
}
*/
/*
for (int i = 0; i < num_detections; ++i) {
for (int a = 0; a < 10; a++) {
qDebug() << "confidence:" << confidence << "class_id" << class_id;
}
//float confidence = outputs[0].at<float>(i, 4);
//int class_id = outputs[0].at<float>(i, 5);
qDebug() << "confidence:" << confidence << "class_id" << class_id;
}
*/
/*
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
// Output tensor processing
for (size_t i = 0; i < outputs.size(); i++) {
float* data = (float*)outputs[i].data;
for (int j = 0; j < outputs[i].rows; ++j) {
float confidence = data[4]; // Objectness confidence
qDebug() <<" Confidence: " << confidence;
if (confidence >= CONFIDENCE_THRESHOLD) {
// Get class with the highest score
cv::Mat scores = outputs[i].row(j).colRange(5, num_classes + 5);
cv::Point class_id_point;
double max_class_score;
cv::minMaxLoc(scores, 0, &max_class_score, 0, &class_id_point);
qDebug() << "Class ID: " << class_id_point.x << " Confidence: " << confidence;
if (max_class_score > CONFIDENCE_THRESHOLD) {
int center_x = (int)(data[0] * image.cols);
int center_y = (int)(data[1] * image.rows);
int width = (int)(data[2] * image.cols);
int height = (int)(data[3] * image.rows);
int left = center_x - width / 2;
int top = center_y - height / 2;
// Store the results
class_ids.push_back(class_id_point.x);
confidences.push_back((float)max_class_score);
boxes.push_back(cv::Rect(left, top, width, height));
}
}
data += num_classes + 5; // Move to the next detection
}
}
*/
/*
// Process the output
for (int i = 0; i < num_detections; ++i) {
float confidence = outputs[0].at<float>(i, 4);
int class_id = outputs[0].at<float>(i, 5);
// ... extract bounding box coordinates
float x_center = outputs[0].at<float>(i, 0);
float y_center = outputs[0].at<float>(i, 1);
float width = outputs[0].at<float>(i, 2);
float height = outputs[0].at<float>(i, 3);
// Calculate bounding box corners
int x = (int)(x_center - width / 2.0);
int y = (int)(y_center - height / 2.0);
int width_int = (int)width;
int height_int = (int)height;
// Print or use the detected information
qDebug() << "Class ID: " << class_id << " Confidence: " << confidence << " Bounding Box: (" << x << ", " << y << ", " << width_int << ", " << height_int << ")";
}
*/
/*
// Perform forward pass
for (int i = 0; i < 10; i++) {
std::vector<cv::Mat> outputs;
QElapsedTimer timer;
timer.start();
net.forward(outputs);
qDebug() << "Inference completed in" << timer.elapsed() << "ms.";
}
*/
// Process the output (YOLO-specific post-processing like NMS is needed)
// Outputs contain class scores, bounding boxes, etc.
return 0;
}