Add core functionality for API client, CDN management, and data augmentation

- Introduced `ApiClient` for handling API interactions, including file uploads and downloads.
- Implemented `CDNManager` for managing CDN operations with AWS S3.
- Added `Augmentator` class for image augmentation, including bounding box corrections and transformations.
- Created utility functions for annotation conversion and dataset visualization.
- Established a new rules file for sound notifications during human input requests.

These additions enhance the system's capabilities for data handling and user interaction, laying the groundwork for future features.

Simplify autopilot state file to minimal current-step pointer; add execution safety rule to cursor-meta; remove Completed Steps/Key Decisions/Retry Log/Blockers from state template and all references.
2026-06-21 20:21:15 +00:00 · 2026-03-28 00:12:54 +02:00
parent 142c6c4de8
commit c20018745b
31 changed files with 0 additions and 0 deletions
@@ -0,0 +1,63 @@
+import json
+from enum import Enum
+from os.path import join, dirname
+
+
+class Detection:
+    """One detected object described by a centre-format bounding box.
+
+    ``x`` and ``y`` are treated as the box centre and ``w`` and ``h`` as the
+    full width and height (that is how ``overlaps`` interprets them).
+    ``cls`` is the class id and ``confidence`` the detection score.
+    """
+
+    def __init__(self, x, y, w, h, cls, confidence):
+        self.x = x
+        self.y = y
+        self.w = w
+        self.h = h
+        self.cls = cls
+        self.confidence = confidence
+
+    def overlaps(self, det2, iou_threshold):
+        """Return True when the IoU with ``det2`` is above ``iou_threshold``.
+
+        Args:
+            det2: The other detection (same coordinate convention).
+            iou_threshold: Intersection-over-union value that must be exceeded.
+
+        Returns:
+            ``True`` if intersection / union > ``iou_threshold``; ``False``
+            otherwise, including when both boxes have no area.
+        """
+        # Per-axis overlap of two centre-format boxes: half the summed extents
+        # minus the distance between centres; negative means a gap.
+        overlap_x = 0.5 * (self.w + det2.w) - abs(self.x - det2.x)
+        overlap_y = 0.5 * (self.h + det2.h) - abs(self.y - det2.y)
+        intersection = max(0, overlap_x) * max(0, overlap_y)
+        union = self.w * self.h + det2.w * det2.h - intersection
+
+        # Two zero-area boxes give union == 0; there is nothing to overlap and
+        # dividing would raise ZeroDivisionError (or yield NaN for numpy floats).
+        if union <= 0:
+            return False
+
+        return intersection / union > iou_threshold
+
+
+class Annotation:
+    """Detections attached to a single frame.
+
+    Holds the frame object, the time value supplied by the caller and the
+    list of detections; a ``None`` detection list is stored as an empty list.
+    """
+
+    def __init__(self, frame, time, detections: list[Detection]):
+        if detections is None:
+            detections = []
+        self.detections = detections
+        self.time = time
+        self.frame = frame
+
+
+class WeatherMode(Enum):
+    """Capture-condition variants whose values act as class-id offsets.
+
+    ``AnnotationClass.read_json`` adds each member's value to a base class
+    ``Id`` and, for non-zero values, appends the member name to the class
+    name.
+    """
+    # NOTE(review): member names presumably mean normal / winter / night
+    # conditions - confirm with the dataset owners.
+    Norm = 0
+    Wint = 20
+    Night = 40
+
+
+class AnnotationClass:
+    """A detectable class: numeric id, display name and display colour.
+
+    ``color`` is a hex string, either ``#RRGGBB`` or ``#AARRGGBB``. The alpha
+    component, when present, is ignored for the derived colour tuples.
+    """
+
+    def __init__(self, id, name, color):
+        self.id = id
+        self.name = name
+        self.color = color
+        red, green, blue = self._parse_rgb(color)
+        # OpenCV drawing functions take colours in B, G, R order.
+        self.opencv_color = (blue, green, red)
+
+    @staticmethod
+    def _parse_rgb(color):
+        """Return ``(r, g, b)`` ints parsed from a hex colour string.
+
+        Accepts ``#RRGGBB`` and ``#AARRGGBB``. Previously ``opencv_color``
+        only handled the first form and ``color_tuple`` only the second, so
+        one of them was wrong for any given input.
+
+        Raises:
+            ValueError: If the string does not contain valid hex digits.
+        """
+        digits = color.lstrip('#')
+        if len(digits) == 8:
+            # Drop the leading alpha byte of the #AARRGGBB form.
+            digits = digits[2:]
+        return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
+
+    @staticmethod
+    def read_json():
+        """Load ``classes.json`` and expand it for every ``WeatherMode``.
+
+        The file is read from the parent of this module's directory. Each
+        entry must provide ``Id``, ``Name`` and ``Color``.
+
+        Returns:
+            dict mapping ``mode.value + Id`` to an ``AnnotationClass``; for
+            modes with a non-zero value the name gets a ``(<mode name>)``
+            suffix.
+        """
+        classes_path = join(dirname(dirname(__file__)), 'classes.json')
+        with open(classes_path, 'r', encoding='utf-8') as f:
+            base_classes = json.load(f)
+
+        annotations_dict = {}
+        for mode in WeatherMode:
+            for cl in base_classes:
+                class_id = mode.value + cl['Id']
+                name = cl['Name'] if mode.value == 0 else f'{cl["Name"]}({mode.name})'
+                annotations_dict[class_id] = AnnotationClass(class_id, name, cl['Color'])
+        return annotations_dict
+
+    @property
+    def color_tuple(self):
+        """The colour as an ``(r, g, b)`` tuple of ints."""
+        return self._parse_rgb(self.color)
@@ -0,0 +1,139 @@
+import cv2
+import numpy as np
+from inference.dto import Annotation, Detection, AnnotationClass
+from inference.onnx_engine import InferenceEngine
+
+
+class Inference:
+    """Runs a detection engine over a video and displays annotated frames.
+
+    Frames are sampled from the video, batched, passed through the engine,
+    filtered by confidence and overlap, then drawn in an OpenCV window.
+    """
+
+    def __init__(self, engine: InferenceEngine, confidence_threshold, iou_threshold):
+        """Store the engine and thresholds and load the class table.
+
+        Args:
+            engine: Engine providing ``get_batch_size``, ``get_input_shape``
+                and ``run``.
+            confidence_threshold: Detections scoring below this are dropped.
+            iou_threshold: IoU above which two detections count as duplicates.
+        """
+        self.engine = engine
+        self.confidence_threshold = confidence_threshold
+        self.iou_threshold = iou_threshold
+        self.batch_size = engine.get_batch_size()
+
+        self.model_height, self.model_width = engine.get_input_shape()
+        self.classes = AnnotationClass.read_json()
+
+    def draw(self, annotation: Annotation):
+        """Draw boxes and labels onto ``annotation.frame`` and show it.
+
+        The frame is modified in place and displayed in the ``'Video'``
+        window. Detection coordinates are scaled by the frame size.
+        """
+        img = annotation.frame
+        img_height, img_width = img.shape[:2]
+        for d in annotation.detections:
+            # Centre/size box -> pixel corner coordinates.
+            x1 = int(img_width * (d.x - d.w / 2))
+            y1 = int(img_height * (d.y - d.h / 2))
+            x2 = int(x1 + img_width * d.w)
+            y2 = int(y1 + img_height * d.h)
+
+            annotation_class = self.classes[d.cls]
+            color = annotation_class.opencv_color
+            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
+            label = f"{annotation_class.name}: {d.confidence:.2f}"
+            (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+
+            # Put the label above the box unless it would leave the frame.
+            label_y = y1 - 10 if y1 - 10 > label_height else y1 + 10
+
+            cv2.rectangle(
+                img, (x1, label_y - label_height), (x1 + label_width, label_y + label_height), color, cv2.FILLED
+            )
+            cv2.putText(img, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
+        cv2.imshow('Video', img)
+
+    def preprocess(self, frames):
+        """Convert frames into one stacked input blob for the engine.
+
+        Each frame is resized to the model input size, scaled by 1/255 and
+        has its R and B channels swapped; the per-frame blobs are stacked
+        along the first axis.
+        """
+        blobs = [cv2.dnn.blobFromImage(frame,
+                                       scalefactor=1.0 / 255.0,
+                                       size=(self.model_width, self.model_height),
+                                       mean=(0, 0, 0),
+                                       swapRB=True,
+                                       crop=False)
+                 for frame in frames]
+        return np.vstack(blobs)
+
+    def postprocess(self, batch_frames, batch_timestamps, output):
+        """Turn raw engine output into one ``Annotation`` per input frame.
+
+        Args:
+            batch_frames: Frames that were fed to the engine.
+            batch_timestamps: Timestamp for each frame, same order.
+            output: Engine result; ``output[0][i]`` holds the rows for frame
+                ``i``, each row read as ``x1, y1, x2, y2, conf, cls`` in
+                model-input pixels.
+
+        Returns:
+            list of ``Annotation`` with normalised, de-duplicated detections.
+        """
+        anns = []
+        # zip() stops at the shortest sequence, so a final partial batch on a
+        # fixed-batch engine (more output rows than frames) no longer raises
+        # IndexError on the frame lookup.
+        for frame, timestamp, frame_output in zip(batch_frames, batch_timestamps, output[0]):
+            detections = []
+            for det in frame_output:
+                # A zero confidence marks the end of the valid rows.
+                if det[4] == 0:
+                    break
+                if det[4] < self.confidence_threshold:
+                    continue
+
+                # Normalise to [0, 1] and clamp to the frame.
+                x1 = max(0, det[0] / self.model_width)
+                y1 = max(0, det[1] / self.model_height)
+                x2 = min(1, det[2] / self.model_width)
+                y2 = min(1, det[3] / self.model_height)
+                w = x2 - x1
+                h = y2 - y1
+                # A box lying entirely outside the frame clamps to a
+                # non-positive size; it cannot be drawn or compared.
+                if w <= 0 or h <= 0:
+                    continue
+
+                conf = round(det[4], 2)
+                class_id = int(det[5])
+                detections.append(Detection((x1 + x2) / 2, (y1 + y2) / 2, w, h, class_id, conf))
+
+            filtered_detections = self.remove_overlapping_detections(detections)
+            anns.append(Annotation(frame, timestamp, filtered_detections))
+        return anns
+
+    def _run_batch(self, batch_frames, batch_timestamps):
+        """Infer on one batch and display it; return False if the user quit."""
+        input_blob = self.preprocess(batch_frames)
+        outputs = self.engine.run(input_blob)
+        for annotation in self.postprocess(batch_frames, batch_timestamps, outputs):
+            self.draw(annotation)
+            print(f'video: {annotation.time / 1000:.3f}s')
+            if cv2.waitKey(1) & 0xFF == ord('q'):
+                return False
+        return True
+
+    def process(self, video):
+        """Run detection over ``video`` and display the annotated frames.
+
+        Every fourth frame is sampled and processed in batches of
+        ``self.batch_size``; leftover frames are processed at the end.
+        Pressing ``q`` in the window stops processing. The capture is always
+        released.
+
+        Args:
+            video: Anything accepted by ``cv2.VideoCapture``.
+        """
+        frame_count = 0
+        batch_frames = []
+        batch_timestamps = []
+        v_input = cv2.VideoCapture(video)
+        try:
+            while v_input.isOpened():
+                ret, frame = v_input.read()
+                if not ret or frame is None:
+                    break
+
+                frame_count += 1
+                if frame_count % 4 == 0:
+                    batch_frames.append(frame)
+                    batch_timestamps.append(int(v_input.get(cv2.CAP_PROP_POS_MSEC)))
+
+                if len(batch_frames) == self.batch_size:
+                    # Previously 'q' only left the inner display loop and the
+                    # video kept being processed.
+                    if not self._run_batch(batch_frames, batch_timestamps):
+                        return
+                    batch_frames.clear()
+                    batch_timestamps.clear()
+
+            if batch_frames:
+                self._run_batch(batch_frames, batch_timestamps)
+        finally:
+            v_input.release()
+
+    def remove_overlapping_detections(self, detections):
+        """Suppress duplicate detections, keeping the most confident ones.
+
+        Detections are visited from highest to lowest confidence (ties go to
+        the lower class id); one is kept only if it does not overlap, per
+        ``Detection.overlaps`` with ``self.iou_threshold``, anything already
+        kept. Class ids are not otherwise considered. The earlier pairwise
+        scan kept comparing against the first box after the winner changed
+        and could discard the best detection of a group.
+
+        Returns:
+            The kept detections in their original relative order.
+        """
+        ranked = sorted(
+            range(len(detections)),
+            key=lambda i: (-detections[i].confidence, detections[i].cls),
+        )
+        kept = []
+        for index in ranked:
+            candidate = detections[index]
+            if not any(candidate.overlaps(detections[k], self.iou_threshold) for k in kept):
+                kept.append(index)
+        return [detections[i] for i in sorted(kept)]
@@ -0,0 +1,47 @@
+import abc
+from typing import List, Tuple
+import numpy as np
+import onnxruntime as onnx
+
+
+class InferenceEngine(abc.ABC):
+    """Abstract interface that every inference backend implements."""
+
+    @abc.abstractmethod
+    def __init__(self, model_path: str, batch_size: int = 1, **kwargs):
+        """Load the model and prepare it for inference."""
+
+    @abc.abstractmethod
+    def get_input_shape(self) -> Tuple[int, int]:
+        """Return the two spatial dimensions of the model input."""
+
+    @abc.abstractmethod
+    def get_batch_size(self) -> int:
+        """Return the batch size the engine works with."""
+
+    @abc.abstractmethod
+    def run(self, input_data: np.ndarray) -> List[np.ndarray]:
+        """Run inference on ``input_data`` and return the output arrays."""
+
+
+class OnnxEngine(InferenceEngine):
+    """Inference backend built on an ONNX Runtime session."""
+
+    def __init__(self, model_bytes, batch_size: int = 1, **kwargs):
+        """Create the session and read input shape and class names.
+
+        Args:
+            model_bytes: Model passed straight to ``InferenceSession``.
+            batch_size: Batch size to use when the model's batch dimension
+                is dynamic; a fixed model batch dimension overrides it.
+            **kwargs: Accepted for interface compatibility; unused.
+        """
+        # Local import so this block does not depend on a file-level change.
+        import ast
+
+        self.batch_size = batch_size
+        self.session = onnx.InferenceSession(model_bytes, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+        self.model_inputs = self.session.get_inputs()
+        self.input_name = self.model_inputs[0].name
+        self.input_shape = self.model_inputs[0].shape
+
+        # ONNX Runtime reports a dynamic dimension as a symbolic name (str)
+        # or None rather than -1, so only a concrete positive int may
+        # replace the requested batch size.
+        model_batch = self.input_shape[0]
+        if isinstance(model_batch, int) and model_batch > 0:
+            self.batch_size = model_batch
+
+        model_meta = self.session.get_modelmeta()
+        print("Metadata:", model_meta.custom_metadata_map)
+        # SECURITY: this used to be eval() on metadata embedded in the model
+        # file, which executes arbitrary expressions. literal_eval only
+        # accepts Python literals.
+        names_literal = model_meta.custom_metadata_map.get("names")
+        self.class_names = ast.literal_eval(names_literal) if names_literal else {}
+
+    def get_input_shape(self) -> Tuple[int, int]:
+        """Return dimensions 2 and 3 of the model input shape."""
+        # NOTE(review): assumes an NCHW input, i.e. (height, width) - confirm.
+        shape = self.input_shape
+        return shape[2], shape[3]
+
+    def get_batch_size(self) -> int:
+        """Return the batch size selected at construction time."""
+        return self.batch_size
+
+    def run(self, input_data: np.ndarray) -> List[np.ndarray]:
+        """Run the session on ``input_data`` and return all model outputs."""
+        return self.session.run(None, {self.input_name: input_data})
@@ -0,0 +1,148 @@
+import re
+import struct
+import subprocess
+from pathlib import Path
+from typing import List, Tuple
+import json
+import numpy as np
+import tensorrt as trt
+import pycuda.driver as cuda
+from inference.onnx_engine import InferenceEngine
+# required for automatically initialize CUDA, do not remove.
+import pycuda.autoinit
+import pynvml
+
+
+class TensorRTEngine(InferenceEngine):
+    """Inference backend that executes a serialized TensorRT engine."""
+
+    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
+    def __init__(self, model_bytes: bytes, batch_size: int = 4, **kwargs):
+        """Deserialize the engine and allocate device/host buffers.
+
+        Args:
+            model_bytes: Serialized TensorRT engine.
+            batch_size: Batch size used when the engine's batch dimension is
+                dynamic (-1); a fixed engine batch dimension overrides it.
+                Previously ``self.batch_size`` was left unset in the dynamic
+                case and initialization failed.
+            **kwargs: Accepted for interface compatibility; unused.
+
+        Raises:
+            RuntimeError: If the engine cannot be loaded or buffers cannot
+                be set up.
+        """
+        try:
+            runtime = trt.Runtime(self.TRT_LOGGER)
+            self.engine = runtime.deserialize_cuda_engine(model_bytes)
+
+            if self.engine is None:
+                raise RuntimeError("Failed to load TensorRT engine!")
+
+            self.context = self.engine.create_execution_context()
+
+            # input
+            # NOTE(review): assumes tensor 0 is the only input and tensor 1
+            # the only output - confirm for other models.
+            self.input_name = self.engine.get_tensor_name(0)
+            engine_input_shape = self.engine.get_tensor_shape(self.input_name)
+            self.batch_size = batch_size if engine_input_shape[0] == -1 else engine_input_shape[0]
+            self.input_shape = [
+                self.batch_size,
+                engine_input_shape[1],  # Channels (usually fixed at 3 for RGB)
+                1280 if engine_input_shape[2] == -1 else engine_input_shape[2],  # Height
+                1280 if engine_input_shape[3] == -1 else engine_input_shape[3]  # Width
+            ]
+            self.context.set_input_shape(self.input_name, self.input_shape)
+            # Size of the device input buffer; run() checks against it.
+            self._input_nbytes = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize
+            self.d_input = cuda.mem_alloc(self._input_nbytes)
+
+            # output
+            self.output_name = self.engine.get_tensor_name(1)
+            engine_output_shape = tuple(self.engine.get_tensor_shape(self.output_name))
+            self.output_shape = [
+                self.batch_size,
+                300 if engine_output_shape[1] == -1 else engine_output_shape[1],  # max detections number
+                6 if engine_output_shape[2] == -1 else engine_output_shape[2]  # x1 y1 x2 y2 conf cls
+            ]
+            self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32)
+            self.d_output = cuda.mem_alloc(self.h_output.nbytes)
+
+            self.stream = cuda.Stream()
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") from e
+
+    def get_input_shape(self) -> Tuple[int, int]:
+        """Return the (height, width) entries of the configured input shape."""
+        return self.input_shape[2], self.input_shape[3]
+
+    def get_batch_size(self) -> int:
+        """Return the batch size the buffers were allocated for."""
+        return self.batch_size
+
+    @staticmethod
+    def get_gpu_memory_bytes(device_id=0) -> int:
+        """Return the total memory of GPU ``device_id`` in bytes.
+
+        Falls back to 2 GiB when NVML cannot be queried.
+        """
+        total_memory = None
+        try:
+            pynvml.nvmlInit()
+            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            total_memory = mem_info.total
+        except pynvml.NVMLError:
+            total_memory = None
+        finally:
+            # Shutdown may itself fail if init never succeeded.
+            try:
+                pynvml.nvmlShutdown()
+            except pynvml.NVMLError:
+                pass
+        return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory  # default 2 Gb
+
+    @staticmethod
+    def get_engine_filename(device_id=0) -> str | None:
+        """Build an engine file name from the GPU's properties.
+
+        The name encodes the compute capability and multiprocessor count of
+        device ``device_id``. Returns ``None`` if the device cannot be
+        queried.
+        """
+        try:
+            device = cuda.Device(device_id)
+            sm_count = device.multiprocessor_count
+            cc_major, cc_minor = device.compute_capability()
+            return f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}.engine"
+        except Exception:
+            return None
+
+    @staticmethod
+    def convert_from_onnx(onnx_model: bytes) -> bytes | None:
+        """Build a serialized TensorRT engine from an ONNX model.
+
+        The workspace limit is 90% of the value from
+        ``get_gpu_memory_bytes``; FP16 is enabled when the platform reports
+        fast FP16 support.
+
+        Returns:
+            The serialized engine, or ``None`` if parsing or building fails.
+        """
+        workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes() * 0.9)
+
+        explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+        with trt.Builder(TensorRTEngine.TRT_LOGGER) as builder, \
+                builder.create_network(explicit_batch_flag) as network, \
+                trt.OnnxParser(network, TensorRTEngine.TRT_LOGGER) as parser, \
+                builder.create_builder_config() as config:
+
+            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
+
+            if not parser.parse(onnx_model):
+                # Report why parsing failed instead of returning None silently.
+                for error_index in range(parser.num_errors):
+                    print(f'ONNX parse error: {parser.get_error(error_index)}')
+                return None
+
+            if builder.platform_has_fast_fp16:
+                print('Converting to supported fp16')
+                config.set_flag(trt.BuilderFlag.FP16)
+            else:
+                print('Converting to supported fp32. (fp16 is not supported)')
+            plan = builder.build_serialized_network(network, config)
+
+            if plan is None:
+                print('Conversion failed.')
+                return None
+
+            return bytes(plan)
+
+    def run(self, input_data: np.ndarray) -> List[np.ndarray]:
+        """Run inference on ``input_data`` and return ``[output]``.
+
+        Args:
+            input_data: Input batch; converted to a contiguous float32 array.
+                It may be smaller than the configured batch but not larger.
+
+        Returns:
+            A one-element list holding the output reshaped to
+            ``self.output_shape``. The array is a view of a reused host
+            buffer and is overwritten by the next call.
+
+        Raises:
+            RuntimeError: If the copy or execution fails, or the input does
+                not fit the device buffer.
+        """
+        try:
+            input_data = np.ascontiguousarray(input_data, dtype=np.float32)
+            # Copying more than was allocated would write past the device buffer.
+            if input_data.nbytes > self._input_nbytes:
+                raise ValueError(
+                    f"input of {input_data.nbytes} bytes exceeds the {self._input_nbytes}-byte input buffer"
+                )
+
+            cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
+            self.context.set_tensor_address(self.input_name, int(self.d_input))  # input buffer
+            self.context.set_tensor_address(self.output_name, int(self.d_output))  # output buffer
+
+            self.context.execute_async_v3(stream_handle=self.stream.handle)
+            self.stream.synchronize()
+
+            # The stream is already synchronized, so a blocking copy is enough.
+            cuda.memcpy_dtoh(self.h_output, self.d_output)
+
+            output = self.h_output.reshape(self.output_shape)
+            return [output]
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}") from e