Refactor inference engine and task management: remove obsolete inference engine and ONNX engine files, update inference processing to use batch handling, and improve the task management structure in the documentation. Adjust task specification paths to match the new directory organization.

This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-03-28 01:04:28 +02:00
parent 1e4ef299f9
commit 5be53739cd
60 changed files with 111875 additions and 208 deletions
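The batch handling mentioned in the commit message amounts to grouping frames so that the batch dimension matches the engine's fixed batch size before each run call. A minimal Cython sketch of that idea, written against the InferenceEngine API added in the diffs below; the run_in_batches helper, the zero-padding of the last batch, and the float32 CHW frame layout are illustrative assumptions, not code from this commit:

import numpy as np
from engines.inference_engine cimport InferenceEngine

def run_in_batches(InferenceEngine engine, frames):
    # Hypothetical helper: pad the final chunk to the engine's fixed batch size,
    # run each full batch, and collect the raw outputs.
    cdef int bs = engine.get_batch_size()
    height, width = engine.get_input_shape()
    results = []
    for start in range(0, len(frames), bs):
        chunk = list(frames[start:start + bs])
        while len(chunk) < bs:
            chunk.append(np.zeros((3, height, width), dtype=np.float32))
        batch = np.ascontiguousarray(np.stack(chunk), dtype=np.float32)
        results.append(engine.run(batch))
    return results

Because run and get_batch_size are declared cdef, a helper like this has to live in Cython code that cimports InferenceEngine; they are not callable from plain Python.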
+32
@@ -0,0 +1,32 @@
def _check_tensor_gpu_index():
    """Return the index of the first NVIDIA GPU with CUDA compute capability 6.1 or higher, or -1 if none is found."""
try:
import pynvml
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
if device_count == 0:
return -1
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
if major > 6 or (major == 6 and minor >= 1):
return i
return -1
except Exception:
return -1
finally:
try:
import pynvml
pynvml.nvmlShutdown()
except Exception:
pass
tensor_gpu_index = _check_tensor_gpu_index()
def create_engine(model_bytes: bytes, batch_size: int = 1):
    """Build a TensorRT engine when a suitable GPU was detected at import time, otherwise fall back to ONNX Runtime."""
if tensor_gpu_index > -1:
from engines.tensorrt_engine import TensorRTEngine
return TensorRTEngine(model_bytes, batch_size)
from engines.onnx_engine import OnnxEngine
return OnnxEngine(model_bytes, batch_size)
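A minimal usage sketch of the factory above. Note that it hands the same bytes to whichever backend it selects, so on the TensorRT path they must already be a serialized TensorRT plan (see TensorRTEngine.convert_from_onnx later in this commit), while on the fallback path they are plain ONNX bytes; the file paths below are illustrative only:

if tensor_gpu_index > -1:
    model_file = "models/detector.engine"   # pre-built TensorRT plan for this GPU (illustrative path)
else:
    model_file = "models/detector.onnx"     # plain ONNX model (illustrative path)
with open(model_file, "rb") as f:
    engine = create_engine(f.read(), batch_size=4)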
File diff suppressed because it is too large
+13
@@ -0,0 +1,13 @@
from engines.inference_engine cimport InferenceEngine
cdef class CoreMLEngine(InferenceEngine):
cdef object model
cdef str input_name
cdef tuple input_shape
cdef list _output_names
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+49
@@ -0,0 +1,49 @@
from engines.inference_engine cimport InferenceEngine
cimport constants_inf
import numpy as np
cdef class CoreMLEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs):
super().__init__(model_bytes, batch_size)
import coremltools as ct
model_path = kwargs.get('model_path')
if model_path is None:
raise ValueError(
"CoreMLEngine requires model_path kwarg "
"pointing to a .mlpackage or .mlmodel")
self.model = ct.models.MLModel(
model_path, compute_units=ct.ComputeUnit.ALL)
spec = self.model.get_spec()
input_desc = spec.description.input[0]
self.input_name = input_desc.name
array_type = input_desc.type.multiArrayType
self.input_shape = tuple(int(s) for s in array_type.shape)
if len(self.input_shape) == 4:
self.batch_size = self.input_shape[0] if self.input_shape[0] > 0 else batch_size
self._output_names = [o.name for o in spec.description.output]
constants_inf.log(<str>f'CoreML model: input={self.input_name} shape={self.input_shape}')
constants_inf.log(<str>f'CoreML outputs: {self._output_names}')
cdef tuple get_input_shape(self):
return self.input_shape[2], self.input_shape[3]
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
prediction = self.model.predict({self.input_name: input_data})
results = []
for name in self._output_names:
val = prediction[name]
if not isinstance(val, np.ndarray):
val = np.array(val)
results.append(val)
return results
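A minimal construction sketch for the CoreML backend. Unlike the other engines it loads the model from disk via the model_path kwarg and ignores the model bytes, and its cdef run method is only reachable from Cython code that cimports InferenceEngine; the module path engines.coreml_engine and the .mlpackage path are assumptions:

from engines.inference_engine cimport InferenceEngine
from engines.coreml_engine import CoreMLEngine   # module path assumed

def detect(frame, str model_path):
    # model_path points at a .mlpackage or .mlmodel on disk; the bytes argument is unused here
    cdef InferenceEngine engine = CoreMLEngine(b"", batch_size=1, model_path=model_path)
    return engine.run(frame)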
File diff suppressed because it is too large
+9
@@ -0,0 +1,9 @@
from typing import List, Tuple
import numpy as np
cdef class InferenceEngine:
cdef public int batch_size
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+12
@@ -0,0 +1,12 @@
cdef class InferenceEngine:
def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs):
self.batch_size = batch_size
cdef tuple get_input_shape(self):
raise NotImplementedError("Subclass must implement get_input_shape")
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
raise NotImplementedError("Subclass must implement run")
+13491
File diff suppressed because it is too large
+14
@@ -0,0 +1,14 @@
from engines.inference_engine cimport InferenceEngine
cdef class OnnxEngine(InferenceEngine):
cdef public object session
cdef object _cpu_session
cdef object model_inputs
cdef str input_name
cdef object input_shape
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+50
@@ -0,0 +1,50 @@
from engines.inference_engine cimport InferenceEngine
import onnxruntime as onnx
cimport constants_inf
import os
def _select_providers():
available = set(onnx.get_available_providers())
skip_coreml = os.environ.get("SKIP_COREML", "").lower() in ("1", "true", "yes")
preferred = ["CoreMLExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
if skip_coreml:
preferred = [p for p in preferred if p != "CoreMLExecutionProvider"]
selected = [p for p in preferred if p in available]
return selected or ["CPUExecutionProvider"]
cdef class OnnxEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs):
super().__init__(model_bytes, batch_size)
providers = _select_providers()
constants_inf.log(<str>f'ONNX providers: {providers}')
self.session = onnx.InferenceSession(model_bytes, providers=providers)
self.model_inputs = self.session.get_inputs()
self.input_name = self.model_inputs[0].name
self.input_shape = self.model_inputs[0].shape
self.batch_size = self.input_shape[0] if self.input_shape[0] != -1 else batch_size
constants_inf.log(f'AI detection model input: {self.model_inputs} {self.input_shape}')
model_meta = self.session.get_modelmeta()
constants_inf.log(f"Metadata: {model_meta.custom_metadata_map}")
self._cpu_session = None
if any("CoreML" in p for p in self.session.get_providers()):
constants_inf.log(<str>'CoreML active — creating CPU fallback session')
self._cpu_session = onnx.InferenceSession(
model_bytes, providers=["CPUExecutionProvider"])
cdef tuple get_input_shape(self):
shape = self.input_shape
return shape[2], shape[3]
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
try:
return self.session.run(None, {self.input_name: input_data})
except Exception:
if self._cpu_session is not None:
return self._cpu_session.run(None, {self.input_name: input_data})
raise
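The provider list above can be steered with the SKIP_COREML environment variable, which forces ONNX Runtime onto CUDA or the CPU even when the CoreML provider is available (useful when a model misbehaves under CoreML). A small usage sketch; the model path is illustrative and the variable must be set before the engine is constructed:

import os
os.environ["SKIP_COREML"] = "1"                  # skip the CoreML execution provider
from engines.onnx_engine import OnnxEngine

with open("models/detector.onnx", "rb") as f:    # illustrative path
    engine = OnnxEngine(f.read(), batch_size=4)
print(engine.session.get_providers())            # session is a cdef public attribute, so readable from Python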
+24
@@ -0,0 +1,24 @@
from engines.inference_engine cimport InferenceEngine
cdef class TensorRTEngine(InferenceEngine):
cdef public object context
cdef public object d_input
cdef public object d_output
cdef str input_name
cdef object input_shape
cdef object h_output
cdef str output_name
cdef object output_shape
cdef object stream
cdef tuple get_input_shape(self)
cdef int get_batch_size(self)
cdef run(self, input_data)
+136
@@ -0,0 +1,136 @@
from engines.inference_engine cimport InferenceEngine
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # required to initialize CUDA automatically; do not remove.
import pynvml
import numpy as np
cimport constants_inf
cdef class TensorRTEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, batch_size: int = 4, **kwargs):
super().__init__(model_bytes, batch_size)
try:
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(model_bytes)
if engine is None:
raise RuntimeError(f"Failed to load TensorRT engine from bytes")
self.context = engine.create_execution_context()
# input
self.input_name = engine.get_tensor_name(0)
engine_input_shape = engine.get_tensor_shape(self.input_name)
if engine_input_shape[0] != -1:
self.batch_size = engine_input_shape[0]
else:
self.batch_size = batch_size
self.input_shape = [
self.batch_size,
engine_input_shape[1], # Channels (usually fixed at 3 for RGB)
1280 if engine_input_shape[2] == -1 else engine_input_shape[2], # Height
1280 if engine_input_shape[3] == -1 else engine_input_shape[3] # Width
]
self.context.set_input_shape(self.input_name, self.input_shape)
input_size = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize
self.d_input = cuda.mem_alloc(input_size)
# output
self.output_name = engine.get_tensor_name(1)
engine_output_shape = tuple(engine.get_tensor_shape(self.output_name))
self.output_shape = [
self.batch_size,
300 if engine_output_shape[1] == -1 else engine_output_shape[1], # max detections number
6 if engine_output_shape[2] == -1 else engine_output_shape[2] # x1 y1 x2 y2 conf cls
]
self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32)
self.d_output = cuda.mem_alloc(self.h_output.nbytes)
self.stream = cuda.Stream()
        except Exception as e:
            raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") from e
@staticmethod
def get_gpu_memory_bytes(int device_id):
total_memory = None
try:
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
total_memory = mem_info.total
except pynvml.NVMLError:
total_memory = None
finally:
try:
pynvml.nvmlShutdown()
except pynvml.NVMLError:
pass
        return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory  # default to 2 GiB
@staticmethod
def get_engine_filename(int device_id):
try:
device = cuda.Device(device_id)
sm_count = device.multiprocessor_count
cc_major, cc_minor = device.compute_capability()
return f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}.engine"
except Exception:
return None
@staticmethod
def convert_from_onnx(bytes onnx_model):
workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes(0) * 0.9)
explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
trt_logger = trt.Logger(trt.Logger.WARNING)
with trt.Builder(trt_logger) as builder, \
builder.create_network(explicit_batch_flag) as network, \
trt.OnnxParser(network, trt_logger) as parser, \
builder.create_builder_config() as config:
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
if not parser.parse(onnx_model):
return None
if builder.platform_has_fast_fp16:
constants_inf.log(<str>'Converting to supported fp16')
config.set_flag(trt.BuilderFlag.FP16)
else:
constants_inf.log(<str>'Converting to supported fp32. (fp16 is not supported)')
plan = builder.build_serialized_network(network, config)
if plan is None:
constants_inf.logerror(<str>'Conversion failed.')
return None
constants_inf.log('conversion done!')
return bytes(plan)
cdef tuple get_input_shape(self):
return self.input_shape[2], self.input_shape[3]
cdef int get_batch_size(self):
return self.batch_size
cdef run(self, input_data):
try:
cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
self.context.set_tensor_address(self.input_name, int(self.d_input)) # input buffer
self.context.set_tensor_address(self.output_name, int(self.d_output)) # output buffer
self.context.execute_async_v3(stream_handle=self.stream.handle)
self.stream.synchronize()
            # copy results back synchronously; the stream was already synchronized above
            cuda.memcpy_dtoh(self.h_output, self.d_output)
output = self.h_output.reshape(self.output_shape)
return [output]
        except Exception as e:
            raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}") from e