add export to FP16

add inference with the possibility to use different inference engines (ONNX Runtime / TensorRT)
zxsanny
2025-03-28 12:54:25 +02:00
parent eaef1a9b66
commit 5b89a21b36
9 changed files with 365 additions and 242 deletions
dto.py +62
@@ -0,0 +1,62 @@
import json
from enum import Enum
from os.path import join, dirname
class Detection:
    def __init__(self, x, y, w, h, cls, confidence):
        # Center-format box: (x, y) is the box center, (w, h) its size,
        # all normalized to [0, 1] relative to the frame.
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        self.cls = cls
        self.confidence = confidence
    def overlaps(self, det2, iou_threshold):
        # IoU test between two center-format, normalized boxes.
        overlap_x = 0.5 * (self.w + det2.w) - abs(self.x - det2.x)
        overlap_y = 0.5 * (self.h + det2.h) - abs(self.y - det2.y)
        intersection = max(0, overlap_x) * max(0, overlap_y)
        union = self.w * self.h + det2.w * det2.h - intersection
        return intersection / union > iou_threshold
class Annotation:
    def __init__(self, frame, time, detections: list[Detection] | None):
        self.frame = frame
        self.time = time  # timestamp in milliseconds from the start of the video
        self.detections = detections if detections is not None else []
class WeatherMode(Enum):
    # Enum values double as class-id offsets in AnnotationClass.read_json().
    Norm = 0
    Wint = 20
    Night = 40
class AnnotationClass:
def __init__(self, id, name, color):
self.id = id
self.name = name
self.color = color
        # OpenCV expects BGR, so reverse the '#RRGGBB' components.
        color_str = color.lstrip('#')
        self.opencv_color = (int(color_str[4:6], 16), int(color_str[2:4], 16), int(color_str[0:2], 16))
    @staticmethod
    def read_json():
        # Build the id -> AnnotationClass map, duplicating every base class
        # once per weather mode, with the mode's value as an id offset.
        classes_path = join(dirname(dirname(__file__)), 'classes.json')
        with open(classes_path, 'r', encoding='utf-8') as f:
            j = json.load(f)
        annotations_dict = {}
        for mode in WeatherMode:
            for cl in j:
                class_id = mode.value + cl['Id']
                name = cl['Name'] if mode.value == 0 else f'{cl["Name"]}({mode.name})'
                annotations_dict[class_id] = AnnotationClass(class_id, name, cl['Color'])
        return annotations_dict
    @property
    def color_tuple(self):
        # RGB tuple parsed from the '#RRGGBB' hex string.
        color = self.color.lstrip('#')
        lv = len(color)
        step = lv // 3
        return tuple(int(color[i:i + step], 16) for i in range(0, lv, step))
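The overlaps() check above is an IoU test on center-format, [0, 1]-normalized boxes. A quick sanity check of the arithmetic, using made-up boxes:

from dto import Detection
# Two equal 0.2 x 0.2 boxes whose centers are 0.1 apart horizontally:
# overlap_x = 0.5 * (0.2 + 0.2) - 0.1 = 0.1, overlap_y = 0.2
# intersection = 0.02, union = 0.04 + 0.04 - 0.02 = 0.06 -> IoU = 1/3
a = Detection(x=0.50, y=0.50, w=0.20, h=0.20, cls=0, confidence=0.9)
b = Detection(x=0.60, y=0.50, w=0.20, h=0.20, cls=0, confidence=0.8)
assert a.overlaps(b, iou_threshold=0.3)      # 1/3 > 0.3
assert not a.overlaps(b, iou_threshold=0.5)  # 1/3 < 0.5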
inference.py +140
@@ -0,0 +1,140 @@
import cv2
import numpy as np
from onnx_engine import InferenceEngine
from dto import AnnotationClass, Annotation, Detection
class Inference:
def __init__(self, engine: InferenceEngine, confidence_threshold, iou_threshold):
self.engine = engine
self.confidence_threshold = confidence_threshold
self.iou_threshold = iou_threshold
self.batch_size = engine.get_batch_size()
self.model_height, self.model_width = engine.get_input_shape()
self.classes = AnnotationClass.read_json()
    def draw(self, annotation: Annotation):
        img = annotation.frame
        img_height, img_width = img.shape[:2]
        for d in annotation.detections:
            # Convert the normalized center-format box to pixel corners.
            x1 = int(img_width * (d.x - d.w / 2))
            y1 = int(img_height * (d.y - d.h / 2))
            x2 = int(x1 + img_width * d.w)
            y2 = int(y1 + img_height * d.h)
            color = self.classes[d.cls].opencv_color
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
label = f"{self.classes[d.cls].name}: {d.confidence:.2f}"
(label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            # Put the label above the box unless it would be clipped at the top.
            label_y = y1 - 10 if y1 - 10 > label_height else y1 + 10
cv2.rectangle(
img, (x1, label_y - label_height), (x1 + label_width, label_y + label_height), color, cv2.FILLED
)
cv2.putText(img, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
cv2.imshow('Video', img)
    def preprocess(self, frames):
        # Normalize pixels to [0, 1], resize to the model input size,
        # swap BGR -> RGB, and produce one NCHW blob per frame.
        blobs = [cv2.dnn.blobFromImage(frame,
scalefactor=1.0 / 255.0,
size=(self.model_width, self.model_height),
mean=(0, 0, 0),
swapRB=True,
crop=False)
for frame in frames]
        # Stack into a single (batch, 3, H, W) array.
        return np.vstack(blobs)
    def postprocess(self, batch_frames, batch_timestamps, output):
        anns = []
        # Iterate only over the frames actually present in this batch;
        # a fixed-batch engine may return more rows than frames.
        for i in range(len(batch_frames)):
            frame = batch_frames[i]
            timestamp = batch_timestamps[i]
            detections = []
            for det in output[0][i]:
                # Rows are zero-padded after the last real detection,
                # so a zero confidence marks the end of the list.
                if det[4] == 0:
                    break
                if det[4] < self.confidence_threshold:
                    continue
                # Corner coordinates in model pixels -> normalized [0, 1].
                x1 = max(0, det[0] / self.model_width)
                y1 = max(0, det[1] / self.model_height)
                x2 = min(1, det[2] / self.model_width)
                y2 = min(1, det[3] / self.model_height)
                conf = round(float(det[4]), 2)
                class_id = int(det[5])
                # Convert to the center format used by Detection.
                x = (x1 + x2) / 2
                y = (y1 + y2) / 2
                w = x2 - x1
                h = y2 - y1
                detections.append(Detection(x, y, w, h, class_id, conf))
filtered_detections = self.remove_overlapping_detections(detections)
# if len(filtered_detections) > 0:
# _, image = cv2.imencode('.jpg', frame)
# image_bytes = image.tobytes()
annotation = Annotation(frame, timestamp, filtered_detections)
anns.append(annotation)
return anns
    def process(self, video):
        frame_count = 0
        batch_frames = []
        batch_timestamps = []
        v_input = cv2.VideoCapture(video)
        while v_input.isOpened():
            ret, frame = v_input.read()
            if not ret or frame is None:
                break
            frame_count += 1
            # Only every 4th frame is processed.
            if frame_count % 4 == 0:
                batch_frames.append(frame)
                batch_timestamps.append(int(v_input.get(cv2.CAP_PROP_POS_MSEC)))
            if len(batch_frames) == self.batch_size:
                input_blob = self.preprocess(batch_frames)
                outputs = self.engine.run(input_blob)
                annotations = self.postprocess(batch_frames, batch_timestamps, outputs)
                for annotation in annotations:
                    self.draw(annotation)
                    print(f'video: {annotation.time / 1000:.3f}s')
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                batch_frames.clear()
                batch_timestamps.clear()
        # Flush the last, possibly partial batch.
        if len(batch_frames) > 0:
            input_blob = self.preprocess(batch_frames)
            outputs = self.engine.run(input_blob)
            annotations = self.postprocess(batch_frames, batch_timestamps, outputs)
            for annotation in annotations:
                self.draw(annotation)
                print(f'video: {annotation.time / 1000:.3f}s')
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        v_input.release()
        cv2.destroyAllWindows()
    def remove_overlapping_detections(self, detections):
        # Greedy NMS: among mutually overlapping detections keep the one
        # with the highest confidence (ties go to the lower class id).
        filtered_output = []
        filtered_out_indexes = []
        for det1_index in range(len(detections)):
            if det1_index in filtered_out_indexes:
                continue
            res = det1_index
            for det2_index in range(det1_index + 1, len(detections)):
                if det2_index in filtered_out_indexes:
                    continue
                # Compare against the current winner, not the starting detection.
                best = detections[res]
                det2 = detections[det2_index]
                if best.overlaps(det2, self.iou_threshold):
                    if best.confidence > det2.confidence or (
                            best.confidence == det2.confidence and best.cls < det2.cls):
                        filtered_out_indexes.append(det2_index)
                    else:
                        filtered_out_indexes.append(res)
                        res = det2_index
            filtered_output.append(detections[res])
            filtered_out_indexes.append(res)
        return filtered_output
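postprocess() assumes each engine returns a fixed-size (batch, max_detections, 6) array of x1 y1 x2 y2 conf cls rows in model-pixel coordinates, zero-padded after the last real detection. A minimal sketch with synthetic output (the existing Inference instance and a 1280x1280 model input are assumptions):

import numpy as np
# One fake frame and one fake detection; the remaining 299 zero rows
# stop the parsing loop at the first conf == 0 entry.
fake_output = np.zeros((1, 300, 6), dtype=np.float32)
fake_output[0, 0] = [0, 0, 256, 128, 0.87, 0]  # x1 y1 x2 y2 conf cls
frame = np.zeros((720, 1280, 3), dtype=np.uint8)
annotations = inference.postprocess([frame], [0], [fake_output])
d = annotations[0].detections[0]
print(d.x, d.y, d.w, d.h, d.confidence)  # center-format, normalized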
onnx_engine.py +43
@@ -0,0 +1,43 @@
import abc
from typing import List, Tuple
import numpy as np
import onnxruntime as ort
class InferenceEngine(abc.ABC):
@abc.abstractmethod
def __init__(self, model_path: str, batch_size: int = 1, **kwargs):
pass
@abc.abstractmethod
def get_input_shape(self) -> Tuple[int, int]:
pass
@abc.abstractmethod
def get_batch_size(self) -> int:
pass
@abc.abstractmethod
def run(self, input_data: np.ndarray) -> List[np.ndarray]:
pass
class OnnxEngine(InferenceEngine):
    def __init__(self, model_path: str, batch_size: int = 1, **kwargs):
        self.model_path = model_path
        self.batch_size = batch_size
        # Prefer CUDA; fall back to CPU if no GPU provider is available.
        self.session = ort.InferenceSession(model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
        self.model_inputs = self.session.get_inputs()
        self.input_name = self.model_inputs[0].name
        self.input_shape = self.model_inputs[0].shape
    def get_input_shape(self) -> Tuple[int, int]:
        # NCHW input: shape[2] is height, shape[3] is width.
        shape = self.input_shape
        return shape[2], shape[3]
def get_batch_size(self) -> int:
return self.batch_size
def run(self, input_data: np.ndarray) -> List[np.ndarray]:
return self.session.run(None, {self.input_name: input_data})
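A minimal usage sketch (the .onnx path is the one referenced in the benchmark comments below; the zero blob just exercises the pipeline):

import numpy as np
engine = OnnxEngine('azaion-2025-03-10.onnx', batch_size=4)
h, w = engine.get_input_shape()
blob = np.zeros((engine.get_batch_size(), 3, h, w), dtype=np.float32)
outputs = engine.run(blob)
print([o.shape for o in outputs])  # raw output arrays from the session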
+20
@@ -0,0 +1,20 @@
from onnx_engine import OnnxEngine
from tensorrt_engine import TensorRTEngine
from inference import Inference
if __name__ == "__main__":
    # Inference(OnnxEngine('azaion-2025-03-10.onnx', batch_size=4),
    #           confidence_threshold=0.5, iou_threshold=0.3).process('ForAI_test.mp4')
    # Detection for the first 200 s of video:
    #   ONNX Runtime: 81 s, 6.3 GB VRAM
    #   TensorRT:     54 s, 3.7 GB VRAM
    # Inference(TensorRTEngine('azaion-2025-03-10_int8.engine', batch_size=16),
    #           confidence_threshold=0.5, iou_threshold=0.3).process('ForAI_test.mp4')
    # INT8 for 200 s: 54 s, 3.7 GB
    # Inference(TensorRTEngine('azaion-2025-03-10_batch8.engine', batch_size=8),
    #           confidence_threshold=0.5, iou_threshold=0.3).process('ForAI_test.mp4')
    Inference(TensorRTEngine('azaion-2025-03-10-half_batch4.engine', batch_size=4),
              confidence_threshold=0.5, iou_threshold=0.3).process('ForAI_test.mp4')
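The FP16 ('half') engine loaded above is not built in this file. The TensorRTEngine loader in the next file reads length-prefixed JSON metadata before the serialized engine, which matches the Ultralytics export layout, so the likely export path is the Ultralytics exporter. A hedged sketch, with a hypothetical .pt checkpoint path:

from ultralytics import YOLO  # assumption: the detector is an Ultralytics YOLO model
model = YOLO('azaion-2025-03-10.pt')  # hypothetical checkpoint path
# half=True requests FP16; batch and imgsz match the engine names used above.
model.export(format='engine', half=True, batch=4, imgsz=1280, device=0)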
tensorrt_engine.py +92
@@ -0,0 +1,92 @@
from typing import List, Tuple
import json
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # required to automatically initialize CUDA; do not remove
from onnx_engine import InferenceEngine
class TensorRTEngine(InferenceEngine):
def __init__(self, model_path: str, batch_size: int = 4, **kwargs):
self.model_path = model_path
self.batch_size = batch_size
try:
logger = trt.Logger(trt.Logger.WARNING)
            # The engine file begins with a 4-byte little-endian length,
            # followed by that many bytes of JSON metadata, then the
            # serialized engine itself.
            with open(model_path, 'rb') as f:
                metadata_len = int.from_bytes(f.read(4), byteorder='little', signed=True)
                metadata_bytes = f.read(metadata_len)
                try:
                    self.metadata = json.loads(metadata_bytes)
                    print(f"Model metadata: {json.dumps(self.metadata, indent=2)}")
                except json.JSONDecodeError:
                    print(f"Failed to parse metadata: {metadata_bytes}")
                    self.metadata = {}
engine_data = f.read()
runtime = trt.Runtime(logger)
self.engine = runtime.deserialize_cuda_engine(engine_data)
if self.engine is None:
raise RuntimeError(f"Failed to load TensorRT engine from {model_path}")
self.context = self.engine.create_execution_context()
# input
self.input_name = self.engine.get_tensor_name(0)
engine_input_shape = self.engine.get_tensor_shape(self.input_name)
self.input_shape = [
batch_size if engine_input_shape[0] == -1 else engine_input_shape[0],
engine_input_shape[1], # Channels (usually fixed at 3 for RGB)
1280 if engine_input_shape[2] == -1 else engine_input_shape[2], # Height
1280 if engine_input_shape[3] == -1 else engine_input_shape[3] # Width
]
self.context.set_input_shape(self.input_name, self.input_shape)
input_size = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize
self.d_input = cuda.mem_alloc(input_size)
            # output
            self.output_name = self.engine.get_tensor_name(1)
            engine_output_shape = tuple(self.engine.get_tensor_shape(self.output_name))
            self.output_shape = [
                batch_size if engine_output_shape[0] == -1 else engine_output_shape[0],
                300 if engine_output_shape[1] == -1 else engine_output_shape[1],  # max number of detections
                6 if engine_output_shape[2] == -1 else engine_output_shape[2]  # x1 y1 x2 y2 conf cls
            ]
            # Page-locked host memory makes device-to-host copies faster.
            self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32)
            self.d_output = cuda.mem_alloc(self.h_output.nbytes)
            self.stream = cuda.Stream()
        except Exception as e:
            raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") from e
def get_input_shape(self) -> Tuple[int, int]:
return self.input_shape[2], self.input_shape[3]
def get_batch_size(self) -> int:
return self.batch_size
def run(self, input_data: np.ndarray) -> List[np.ndarray]:
try:
cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
self.context.set_tensor_address(self.input_name, int(self.d_input)) # input buffer
self.context.set_tensor_address(self.output_name, int(self.d_output)) # output buffer
self.context.execute_async_v3(stream_handle=self.stream.handle)
self.stream.synchronize()
            # The stream has already been synchronized, so a plain
            # synchronous copy back to the page-locked host buffer is safe.
            cuda.memcpy_dtoh(self.h_output, self.d_output)
            output = self.h_output.reshape(self.output_shape)
            return [output]
        except Exception as e:
            raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}") from e
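For reference, the file layout the constructor parses (4-byte little-endian length, UTF-8 JSON metadata, serialized engine) can be written with a sketch like this; the metadata fields and paths are placeholders:

import json
meta = json.dumps({'batch': 4, 'imgsz': [1280, 1280]}).encode('utf-8')  # hypothetical fields
with open('model_with_meta.engine', 'wb') as f:  # hypothetical output path
    f.write(len(meta).to_bytes(4, byteorder='little', signed=True))
    f.write(meta)
    # serialized_engine: bytes from builder.build_serialized_network(...)
    f.write(serialized_engine)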