Refactor inference engine and task management: remove obsolete inference engine and ONNX engine files, update inference processing to use batch handling, and improve the task management structure in the documentation. Adjust task specification paths to match the new directory organization.

This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-03-28 01:04:28 +02:00
parent 1e4ef299f9
commit 5be53739cd
60 changed files with 111875 additions and 208 deletions
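The batch handling mentioned in the commit message amounts to grouping frames so that the batch dimension matches the engine's fixed batch size before each run call. A minimal Cython sketch of that idea, written against the InferenceEngine API added in the diffs below; the run_in_batches helper, the zero-padding of the last batch, and the float32 CHW frame layout are illustrative assumptions, not code from this commit:

import numpy as np
from engines.inference_engine cimport InferenceEngine

def run_in_batches(InferenceEngine engine, frames):
    # Hypothetical helper: pad the final chunk to the engine's fixed batch size,
    # run each full batch, and collect the raw outputs.
    cdef int bs = engine.get_batch_size()
    height, width = engine.get_input_shape()
    results = []
    for start in range(0, len(frames), bs):
        chunk = list(frames[start:start + bs])
        while len(chunk) < bs:
            chunk.append(np.zeros((3, height, width), dtype=np.float32))
        batch = np.ascontiguousarray(np.stack(chunk), dtype=np.float32)
        results.append(engine.run(batch))
    return results

Because run and get_batch_size are declared cdef, a helper like this has to live in Cython code that cimports InferenceEngine; they are not callable from plain Python.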
+32
@@ -0,0 +1,32 @@
def _check_tensor_gpu_index():
    """Return the index of the first NVIDIA GPU with CUDA compute capability 6.1 or higher, or -1 if none is found."""
try:
import pynvml
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
if device_count == 0:
return -1
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
if major > 6 or (major == 6 and minor >= 1):
return i
return -1
except Exception:
return -1
finally:
try:
import pynvml
pynvml.nvmlShutdown()
except Exception:
pass
tensor_gpu_index = _check_tensor_gpu_index()
def create_engine(model_bytes: bytes, batch_size: int = 1):
    """Build a TensorRT engine when a suitable GPU was detected at import time, otherwise fall back to ONNX Runtime."""
if tensor_gpu_index > -1:
from engines.tensorrt_engine import TensorRTEngine
return TensorRTEngine(model_bytes, batch_size)
from engines.onnx_engine import OnnxEngine
return OnnxEngine(model_bytes, batch_size)
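A minimal usage sketch of the factory above. Note that it hands the same bytes to whichever backend it selects, so on the TensorRT path they must already be a serialized TensorRT plan (see TensorRTEngine.convert_from_onnx later in this commit), while on the fallback path they are plain ONNX bytes; the file paths below are illustrative only:

if tensor_gpu_index > -1:
    model_file = "models/detector.engine"   # pre-built TensorRT plan for this GPU (illustrative path)
else:
    model_file = "models/detector.onnx"     # plain ONNX model (illustrative path)
with open(model_file, "rb") as f:
    engine = create_engine(f.read(), batch_size=4)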
File diff suppressed because it is too large
+13
@@ -0,0 +1,13 @@
from engines.inference_engine cimport InferenceEngine
cdef class CoreMLEngine(InferenceEngine):
cdef object model
cdef str input_name
cdef tuple input_shape
cdef list _output_names
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+49
@@ -0,0 +1,49 @@
from engines.inference_engine cimport InferenceEngine
cimport constants_inf
import numpy as np
cdef class CoreMLEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs):
super().__init__(model_bytes, batch_size)
import coremltools as ct
model_path = kwargs.get('model_path')
if model_path is None:
raise ValueError(
"CoreMLEngine requires model_path kwarg "
"pointing to a .mlpackage or .mlmodel")
self.model = ct.models.MLModel(
model_path, compute_units=ct.ComputeUnit.ALL)
spec = self.model.get_spec()
input_desc = spec.description.input[0]
self.input_name = input_desc.name
array_type = input_desc.type.multiArrayType
self.input_shape = tuple(int(s) for s in array_type.shape)
if len(self.input_shape) == 4:
self.batch_size = self.input_shape[0] if self.input_shape[0] > 0 else batch_size
self._output_names = [o.name for o in spec.description.output]
constants_inf.log(<str>f'CoreML model: input={self.input_name} shape={self.input_shape}')
constants_inf.log(<str>f'CoreML outputs: {self._output_names}')
cdef tuple get_input_shape(self):
return self.input_shape[2], self.input_shape[3]
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
prediction = self.model.predict({self.input_name: input_data})
results = []
for name in self._output_names:
val = prediction[name]
if not isinstance(val, np.ndarray):
val = np.array(val)
results.append(val)
return results
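A minimal construction sketch for the CoreML backend. Unlike the other engines it loads the model from disk via the model_path kwarg and ignores the model bytes, and its cdef run method is only reachable from Cython code that cimports InferenceEngine; the module path engines.coreml_engine and the .mlpackage path are assumptions:

from engines.inference_engine cimport InferenceEngine
from engines.coreml_engine import CoreMLEngine   # module path assumed

def detect(frame, str model_path):
    # model_path points at a .mlpackage or .mlmodel on disk; the bytes argument is unused here
    cdef InferenceEngine engine = CoreMLEngine(b"", batch_size=1, model_path=model_path)
    return engine.run(frame)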
File diff suppressed because it is too large
+9
@@ -0,0 +1,9 @@
from typing import List, Tuple
import numpy as np
cdef class InferenceEngine:
cdef public int batch_size
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+12
@@ -0,0 +1,12 @@
cdef class InferenceEngine:
def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs):
self.batch_size = batch_size
cdef tuple get_input_shape(self):
raise NotImplementedError("Subclass must implement get_input_shape")
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
raise NotImplementedError("Subclass must implement run")
+13491
File diff suppressed because it is too large
+14
@@ -0,0 +1,14 @@
from engines.inference_engine cimport InferenceEngine
cdef class OnnxEngine(InferenceEngine):
cdef public object session
cdef object _cpu_session
cdef object model_inputs
cdef str input_name
cdef object input_shape
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+50
@@ -0,0 +1,50 @@
from engines.inference_engine cimport InferenceEngine
import onnxruntime as onnx
cimport constants_inf
import os
def _select_providers():
available = set(onnx.get_available_providers())
skip_coreml = os.environ.get("SKIP_COREML", "").lower() in ("1", "true", "yes")
preferred = ["CoreMLExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
if skip_coreml:
preferred = [p for p in preferred if p != "CoreMLExecutionProvider"]
selected = [p for p in preferred if p in available]
return selected or ["CPUExecutionProvider"]
cdef class OnnxEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs):
super().__init__(model_bytes, batch_size)
providers = _select_providers()
constants_inf.log(<str>f'ONNX providers: {providers}')
self.session = onnx.InferenceSession(model_bytes, providers=providers)
self.model_inputs = self.session.get_inputs()
self.input_name = self.model_inputs[0].name
self.input_shape = self.model_inputs[0].shape
self.batch_size = self.input_shape[0] if self.input_shape[0] != -1 else batch_size
constants_inf.log(f'AI detection model input: {self.model_inputs} {self.input_shape}')
model_meta = self.session.get_modelmeta()
constants_inf.log(f"Metadata: {model_meta.custom_metadata_map}")
self._cpu_session = None
if any("CoreML" in p for p in self.session.get_providers()):
constants_inf.log(<str>'CoreML active — creating CPU fallback session')
self._cpu_session = onnx.InferenceSession(
model_bytes, providers=["CPUExecutionProvider"])
cdef tuple get_input_shape(self):
shape = self.input_shape
return shape[2], shape[3]
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
try:
return self.session.run(None, {self.input_name: input_data})
except Exception:
if self._cpu_session is not None:
return self._cpu_session.run(None, {self.input_name: input_data})
raise
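The provider list above can be steered with the SKIP_COREML environment variable, which forces ONNX Runtime onto CUDA or the CPU even when the CoreML provider is available (useful when a model misbehaves under CoreML). A small usage sketch; the model path is illustrative and the variable must be set before the engine is constructed:

import os
os.environ["SKIP_COREML"] = "1"                  # skip the CoreML execution provider
from engines.onnx_engine import OnnxEngine

with open("models/detector.onnx", "rb") as f:    # illustrative path
    engine = OnnxEngine(f.read(), batch_size=4)
print(engine.session.get_providers())            # session is a cdef public attribute, so readable from Python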
+24
@@ -0,0 +1,24 @@
from engines.inference_engine cimport InferenceEngine
cdef class TensorRTEngine(InferenceEngine):
cdef public object context
cdef public object d_input
cdef public object d_output
cdef str input_name
cdef object input_shape
cdef object h_output
cdef str output_name
cdef object output_shape
cdef object stream
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+136
@@ -0,0 +1,136 @@
from engines.inference_engine cimport InferenceEngine
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # required to initialize CUDA automatically; do not remove.
import pynvml
import numpy as np
cimport constants_inf
cdef class TensorRTEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, batch_size: int = 4, **kwargs):
super().__init__(model_bytes, batch_size)
try:
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(model_bytes)
if engine is None:
raise RuntimeError(f"Failed to load TensorRT engine from bytes")
self.context = engine.create_execution_context()
# input
self.input_name = engine.get_tensor_name(0)
engine_input_shape = engine.get_tensor_shape(self.input_name)
if engine_input_shape[0] != -1:
self.batch_size = engine_input_shape[0]
else:
self.batch_size = batch_size
self.input_shape = [
self.batch_size,
engine_input_shape[1], # Channels (usually fixed at 3 for RGB)
1280 if engine_input_shape[2] == -1 else engine_input_shape[2], # Height
1280 if engine_input_shape[3] == -1 else engine_input_shape[3] # Width
]
self.context.set_input_shape(self.input_name, self.input_shape)
input_size = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize
self.d_input = cuda.mem_alloc(input_size)
# output
self.output_name = engine.get_tensor_name(1)
engine_output_shape = tuple(engine.get_tensor_shape(self.output_name))
self.output_shape = [
self.batch_size,
300 if engine_output_shape[1] == -1 else engine_output_shape[1], # max detections number
6 if engine_output_shape[2] == -1 else engine_output_shape[2] # x1 y1 x2 y2 conf cls
]
self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32)
self.d_output = cuda.mem_alloc(self.h_output.nbytes)
self.stream = cuda.Stream()
        except Exception as e:
            raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") from e
@staticmethod
def get_gpu_memory_bytes(int device_id):
total_memory = None
try:
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
total_memory = mem_info.total
except pynvml.NVMLError:
total_memory = None
finally:
try:
pynvml.nvmlShutdown()
except pynvml.NVMLError:
pass
        return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory  # default to 2 GiB
@staticmethod
def get_engine_filename(int device_id):
try:
device = cuda.Device(device_id)
sm_count = device.multiprocessor_count
cc_major, cc_minor = device.compute_capability()
return f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}.engine"
except Exception:
return None
@staticmethod
def convert_from_onnx(bytes onnx_model):
workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes(0) * 0.9)
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
trt_logger = trt.Logger(trt.Logger.WARNING)
with trt.Builder(trt_logger) as builder, \
builder.create_network(explicit_batch_flag) as network, \
trt.OnnxParser(network, trt_logger) as parser, \
builder.create_builder_config() as config:
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
if not parser.parse(onnx_model):
return None
if builder.platform_has_fast_fp16:
constants_inf.log(<str>'Converting to supported fp16')
config.set_flag(trt.BuilderFlag.FP16)
else:
constants_inf.log(<str>'Converting to supported fp32. (fp16 is not supported)')
plan = builder.build_serialized_network(network, config)
if plan is None:
constants_inf.logerror(<str>'Conversion failed.')
return None
constants_inf.log('conversion done!')
return bytes(plan)
cdef tuple get_input_shape(self):
return self.input_shape[2], self.input_shape[3]
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
try:
cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
self.context.set_tensor_address(self.input_name, int(self.d_input)) # input buffer
self.context.set_tensor_address(self.output_name, int(self.d_output)) # output buffer
self.context.execute_async_v3(stream_handle=self.stream.handle)
self.stream.synchronize()
            # copy results back synchronously; the stream was already synchronized above
            cuda.memcpy_dtoh(self.h_output, self.d_output)
output = self.h_output.reshape(self.output_shape)
return [output]
        except Exception as e:
            raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}") from e