diff --git a/Azaion.CommonSecurity/Services/InferenceClient.cs b/Azaion.CommonSecurity/Services/InferenceClient.cs index 7fc307f..afbc0dd 100644 --- a/Azaion.CommonSecurity/Services/InferenceClient.cs +++ b/Azaion.CommonSecurity/Services/InferenceClient.cs @@ -27,6 +27,7 @@ public class InferenceClient : IInferenceClient { _inferenceClientConfig = config.Value; Start(); + _ = Task.Run(ProcessClientCommands); } private void Start() @@ -57,6 +58,11 @@ public class InferenceClient : IInferenceClient _dealer.Connect($"tcp://{_inferenceClientConfig.ZeroMqHost}:{_inferenceClientConfig.ZeroMqPort}"); } + private async Task ProcessClientCommands() + { + + } + public void Stop() { if (!_dealer.IsDisposed) diff --git a/Azaion.Inference/azaion-inference.spec b/Azaion.Inference/azaion-inference.spec index 017e6b8..80ecf15 100644 --- a/Azaion.Inference/azaion-inference.spec +++ b/Azaion.Inference/azaion-inference.spec @@ -1,11 +1,11 @@ # -*- mode: python ; coding: utf-8 -*- +from PyInstaller.utils.hooks import collect_submodules from PyInstaller.utils.hooks import collect_all -datas = [] +datas = [('venv\\Lib\\site-packages\\cv2', 'cv2')] binaries = [] hiddenimports = ['constants', 'annotation', 'credentials', 'file_data', 'user', 'security', 'secure_model', 'cdn_manager', 'api_client', 'hardware_service', 'remote_command', 'ai_config', 'inference_engine', 'inference', 'remote_command_handler'] -tmp_ret = collect_all('jwt') -datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] +hiddenimports += collect_submodules('cv2') tmp_ret = collect_all('requests') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] tmp_ret = collect_all('psutil') @@ -16,8 +16,6 @@ tmp_ret = collect_all('zmq') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] tmp_ret = collect_all('cryptography') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] -tmp_ret = collect_all('cv2') -datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] tmp_ret = collect_all('numpy') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] tmp_ret = collect_all('onnxruntime') @@ -28,8 +26,6 @@ tmp_ret = collect_all('pycuda') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] tmp_ret = collect_all('pynvml') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] -tmp_ret = collect_all('re') -datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] tmp_ret = collect_all('boto3') datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2] diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx index 2a0c6e6..d505e60 100644 --- a/Azaion.Inference/inference.pyx +++ b/Azaion.Inference/inference.pyx @@ -2,6 +2,7 @@ import json import mimetypes import os import subprocess +import sys import time import cv2 @@ -11,10 +12,15 @@ cimport constants from remote_command cimport RemoteCommand from annotation cimport Detection, Annotation from ai_config cimport AIRecognitionConfig -from inference_engine cimport OnnxEngine, TensorRTEngine from hardware_service cimport HardwareService from security cimport Security +if HardwareService.has_nvidia_gpu(): + from tensorrt_engine cimport TensorRTEngine +else: + from onnx_engine import OnnxEngine + + cdef class Inference: def __init__(self, api_client, on_annotation): self.api_client = api_client @@ -31,17 +37,20 @@ cdef class Inference: if not is_nvidia: return - engine_filename = TensorRTEngine.get_engine_filename() + engine_filename = TensorRTEngine.get_engine_filename(0) key = Security.get_model_encryption_key() models_dir = constants.MODELS_FOLDER if not os.path.exists(os.path.join( models_dir, f'{engine_filename}.big')): + #TODO: Check cdn on engine exists, if there is, download self.is_building_engine = True - time.sleep(5) # prevent simultaneously loading dll and models + time.sleep(8) # prevent simultaneously loading dll and models onnx_model = self.api_client.load_big_small_resource(constants.AI_ONNX_MODEL_FILE, models_dir, key) model_bytes = TensorRTEngine.convert_from_onnx(onnx_model) self.api_client.upload_big_small_resource(model_bytes, engine_filename, models_dir, key) print('uploaded ') self.is_building_engine = False + else: + print('tensor rt engine is here, no need to build') cdef init_ai(self): @@ -54,10 +63,9 @@ cdef class Inference: if is_nvidia: while self.is_building_engine: time.sleep(1) - engine_filename = TensorRTEngine.get_engine_filename() + engine_filename = TensorRTEngine.get_engine_filename(0) model_bytes = self.api_client.load_big_small_resource(engine_filename, models_dir, key) self.engine = TensorRTEngine(model_bytes) - else: model_bytes = self.api_client.load_big_small_resource(constants.AI_ONNX_MODEL_FILE, models_dir, key) self.engine = OnnxEngine(model_bytes) diff --git a/Azaion.Inference/inference_engine.pxd b/Azaion.Inference/inference_engine.pxd index 15799a0..73680d6 100644 --- a/Azaion.Inference/inference_engine.pxd +++ b/Azaion.Inference/inference_engine.pxd @@ -6,30 +6,4 @@ cdef class InferenceEngine: cdef public int batch_size cdef tuple get_input_shape(self) cdef int get_batch_size(self) - cpdef run(self, input_data) - -cdef class OnnxEngine(InferenceEngine): - cdef object session - cdef list model_inputs - cdef str input_name - cdef object input_shape - -cdef class TensorRTEngine(InferenceEngine): - cdef object stream - cdef object context - cdef str input_name - cdef str output_name - cdef object d_input - cdef object d_output - cdef object input_shape - cdef object output_shape - cdef object h_output - - @staticmethod - cdef bytes convert_from_onnx(bytes onnx_model) - - @staticmethod - cdef unsigned long long get_gpu_memory_bytes(device_id=?) - - @staticmethod - cdef str get_engine_filename(device_id=?) \ No newline at end of file + cdef run(self, input_data) diff --git a/Azaion.Inference/inference_engine.pyx b/Azaion.Inference/inference_engine.pyx index 0353111..9a83be8 100644 --- a/Azaion.Inference/inference_engine.pyx +++ b/Azaion.Inference/inference_engine.pyx @@ -1,14 +1,3 @@ -import json -import struct -from typing import List, Tuple -import numpy as np -import onnxruntime as onnx -import tensorrt as trt -import pycuda.driver as cuda -import pycuda.autoinit # required for automatically initialize CUDA, do not remove. -import pynvml -cimport constants - cdef class InferenceEngine: def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs): self.batch_size = batch_size @@ -19,159 +8,5 @@ cdef class InferenceEngine: cdef int get_batch_size(self): return self.batch_size - cpdef run(self, input_data): + cdef run(self, input_data): raise NotImplementedError("Subclass must implement run") - - -cdef class OnnxEngine(InferenceEngine): - def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs): - super().__init__(model_bytes, batch_size) - - - self.session = onnx.InferenceSession(model_bytes, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) - self.model_inputs = self.session.get_inputs() - self.input_name = self.model_inputs[0].name - self.input_shape = self.model_inputs[0].shape - self.batch_size = self.input_shape[0] if self.input_shape[0] != -1 else batch_size - print(f'AI detection model input: {self.model_inputs} {self.input_shape}') - model_meta = self.session.get_modelmeta() - print("Metadata:", model_meta.custom_metadata_map) - - cdef tuple get_input_shape(self): - shape = self.input_shape - return shape[2], shape[3] - - cdef int get_batch_size(self): - return self.batch_size - - cpdef run(self, input_data): - return self.session.run(None, {self.input_name: input_data}) - - -cdef class TensorRTEngine(InferenceEngine): - def __init__(self, model_bytes: bytes, batch_size: int = 4, **kwargs): - super().__init__(model_bytes, batch_size) - try: - logger = trt.Logger(trt.Logger.WARNING) - - runtime = trt.Runtime(logger) - engine = runtime.deserialize_cuda_engine(model_bytes) - - if engine is None: - raise RuntimeError(f"Failed to load TensorRT engine from bytes") - - self.context = engine.create_execution_context() - - # input - self.input_name = engine.get_tensor_name(0) - engine_input_shape = engine.get_tensor_shape(self.input_name) - if engine_input_shape[0] != -1: - self.batch_size = engine_input_shape[0] - else: - self.batch_size = batch_size - - self.input_shape = [ - self.batch_size, - engine_input_shape[1], # Channels (usually fixed at 3 for RGB) - 1280 if engine_input_shape[2] == -1 else engine_input_shape[2], # Height - 1280 if engine_input_shape[3] == -1 else engine_input_shape[3] # Width - ] - self.context.set_input_shape(self.input_name, self.input_shape) - input_size = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize - self.d_input = cuda.mem_alloc(input_size) - - # output - self.output_name = engine.get_tensor_name(1) - engine_output_shape = tuple(engine.get_tensor_shape(self.output_name)) - self.output_shape = [ - self.batch_size, - 300 if engine_output_shape[1] == -1 else engine_output_shape[1], # max detections number - 6 if engine_output_shape[2] == -1 else engine_output_shape[2] # x1 y1 x2 y2 conf cls - ] - self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32) - self.d_output = cuda.mem_alloc(self.h_output.nbytes) - - self.stream = cuda.Stream() - - except Exception as e: - raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") - - @staticmethod - cdef unsigned long long get_gpu_memory_bytes(device_id=0): - total_memory = None - try: - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) - total_memory = mem_info.total - except pynvml.NVMLError: - total_memory = None - finally: - try: - pynvml.nvmlShutdown() - except pynvml.NVMLError: - pass - return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory # default 2 Gb - - @staticmethod - cdef str get_engine_filename(device_id=0): - try: - device = cuda.Device(device_id) - sm_count = device.multiprocessor_count - cc_major, cc_minor = device.compute_capability() - return f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}.engine" - except Exception: - return None - - @staticmethod - cdef bytes convert_from_onnx(bytes onnx_model): - cdef unsigned long long workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes() * 0.9) - - explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - trt_logger = trt.Logger(trt.Logger.WARNING) - - with trt.Builder(trt_logger) as builder, \ - builder.create_network(explicit_batch_flag) as network, \ - trt.OnnxParser(network, trt_logger) as parser, \ - builder.create_builder_config() as config: - - config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes) - - if not parser.parse(onnx_model): - return None - - if builder.platform_has_fast_fp16: - constants.log('Converting to supported fp16') - config.set_flag(trt.BuilderFlag.FP16) - else: - print('Converting to supported fp32. (fp16 is not supported)') - plan = builder.build_serialized_network(network, config) - - if plan is None: - print('Conversion failed.') - return None - constants.log('conversion done!') - return bytes(plan) - - cdef tuple get_input_shape(self): - return self.input_shape[2], self.input_shape[3] - - cdef int get_batch_size(self): - return self.batch_size - - cpdef run(self, input_data): - try: - cuda.memcpy_htod_async(self.d_input, input_data, self.stream) - self.context.set_tensor_address(self.input_name, int(self.d_input)) # input buffer - self.context.set_tensor_address(self.output_name, int(self.d_output)) # output buffer - - self.context.execute_async_v3(stream_handle=self.stream.handle) - self.stream.synchronize() - - # Fix: Remove the stream parameter from memcpy_dtoh - cuda.memcpy_dtoh(self.h_output, self.d_output) - output = self.h_output.reshape(self.output_shape) - return [output] - - except Exception as e: - raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}") \ No newline at end of file diff --git a/Azaion.Inference/onnx_engine.pyx b/Azaion.Inference/onnx_engine.pyx new file mode 100644 index 0000000..ed8cdeb --- /dev/null +++ b/Azaion.Inference/onnx_engine.pyx @@ -0,0 +1,25 @@ +from inference_engine cimport InferenceEngine +import onnxruntime as onnx + +cdef class OnnxEngine(InferenceEngine): + def __init__(self, model_bytes: bytes, batch_size: int = 1, **kwargs): + super().__init__(model_bytes, batch_size) + + self.session = onnx.InferenceSession(model_bytes, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) + self.model_inputs = self.session.get_inputs() + self.input_name = self.model_inputs[0].name + self.input_shape = self.model_inputs[0].shape + self.batch_size = self.input_shape[0] if self.input_shape[0] != -1 else batch_size + print(f'AI detection model input: {self.model_inputs} {self.input_shape}') + model_meta = self.session.get_modelmeta() + print("Metadata:", model_meta.custom_metadata_map) + + cdef tuple get_input_shape(self): + shape = self.input_shape + return shape[2], shape[3] + + cdef int get_batch_size(self): + return self.batch_size + + cpdef run(self, input_data): + return self.session.run(None, {self.input_name: input_data}) \ No newline at end of file diff --git a/Azaion.Inference/remote_command.pxd b/Azaion.Inference/remote_command.pxd index 8817795..014ece0 100644 --- a/Azaion.Inference/remote_command.pxd +++ b/Azaion.Inference/remote_command.pxd @@ -12,3 +12,5 @@ cdef class RemoteCommand: @staticmethod cdef from_msgpack(bytes data) + + cdef bytes serialize(self) diff --git a/Azaion.Inference/remote_command.pyx b/Azaion.Inference/remote_command.pyx index 5109387..a053685 100644 --- a/Azaion.Inference/remote_command.pyx +++ b/Azaion.Inference/remote_command.pyx @@ -20,3 +20,9 @@ cdef class RemoteCommand: cdef from_msgpack(bytes data): unpacked = msgpack.unpackb(data, strict_map_key=False) return RemoteCommand(unpacked.get("CommandType"), unpacked.get("Data")) + + cdef bytes serialize(self): + return msgpack.packb({ + "CommandType": self.command_type, + "Data": self.data + }) diff --git a/Azaion.Inference/requirements.txt b/Azaion.Inference/requirements.txt index b7dddb5..b353bfa 100644 --- a/Azaion.Inference/requirements.txt +++ b/Azaion.Inference/requirements.txt @@ -3,7 +3,6 @@ Cython opencv-python==4.10.0.84 numpy onnxruntime-gpu -onnx cryptography psutil msgpack diff --git a/Azaion.Inference/setup.py b/Azaion.Inference/setup.py index ca2e038..ee669b0 100644 --- a/Azaion.Inference/setup.py +++ b/Azaion.Inference/setup.py @@ -15,6 +15,8 @@ extensions = [ Extension('cdn_manager', ['cdn_manager.pyx']), Extension('api_client', ['api_client.pyx']), Extension('ai_config', ['ai_config.pyx']), + Extension('tensorrt_engine', ['tensorrt_engine.pyx'], include_dirs=[np.get_include()]), + Extension('onnx_engine', ['onnx_engine.pyx'], include_dirs=[np.get_include()]), Extension('inference_engine', ['inference_engine.pyx'], include_dirs=[np.get_include()]), Extension('inference', ['inference.pyx'], include_dirs=[np.get_include()]), Extension('main', ['main.pyx']), diff --git a/Azaion.Inference/tensorrt_engine.pxd b/Azaion.Inference/tensorrt_engine.pxd new file mode 100644 index 0000000..90d8b79 --- /dev/null +++ b/Azaion.Inference/tensorrt_engine.pxd @@ -0,0 +1,32 @@ +from inference_engine cimport InferenceEngine + + +cdef class TensorRTEngine(InferenceEngine): + + cdef public object context + + cdef public object d_input + cdef public object d_output + cdef str input_name + cdef object input_shape + + cdef object h_output + cdef str output_name + cdef object output_shape + + cdef object stream + + @staticmethod + cdef get_gpu_memory_bytes(int device_id) + + @staticmethod + cdef get_engine_filename(int device_id) + + @staticmethod + cdef convert_from_onnx(bytes onnx_model) + + cdef tuple get_input_shape(self) + + cdef int get_batch_size(self) + + cdef run(self, input_data) diff --git a/Azaion.Inference/tensorrt_engine.pyx b/Azaion.Inference/tensorrt_engine.pyx new file mode 100644 index 0000000..c7cf1a5 --- /dev/null +++ b/Azaion.Inference/tensorrt_engine.pyx @@ -0,0 +1,136 @@ +from inference_engine cimport InferenceEngine +import tensorrt as trt +import pycuda.driver as cuda +import pycuda.autoinit # required for automatically initialize CUDA, do not remove. +import pynvml +import numpy as np +cimport constants + + +cdef class TensorRTEngine(InferenceEngine): + def __init__(self, model_bytes: bytes, batch_size: int = 4, **kwargs): + super().__init__(model_bytes, batch_size) + try: + logger = trt.Logger(trt.Logger.WARNING) + + runtime = trt.Runtime(logger) + engine = runtime.deserialize_cuda_engine(model_bytes) + + if engine is None: + raise RuntimeError(f"Failed to load TensorRT engine from bytes") + + self.context = engine.create_execution_context() + + # input + self.input_name = engine.get_tensor_name(0) + engine_input_shape = engine.get_tensor_shape(self.input_name) + if engine_input_shape[0] != -1: + self.batch_size = engine_input_shape[0] + else: + self.batch_size = batch_size + + self.input_shape = [ + self.batch_size, + engine_input_shape[1], # Channels (usually fixed at 3 for RGB) + 1280 if engine_input_shape[2] == -1 else engine_input_shape[2], # Height + 1280 if engine_input_shape[3] == -1 else engine_input_shape[3] # Width + ] + self.context.set_input_shape(self.input_name, self.input_shape) + input_size = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize + self.d_input = cuda.mem_alloc(input_size) + + # output + self.output_name = engine.get_tensor_name(1) + engine_output_shape = tuple(engine.get_tensor_shape(self.output_name)) + self.output_shape = [ + self.batch_size, + 300 if engine_output_shape[1] == -1 else engine_output_shape[1], # max detections number + 6 if engine_output_shape[2] == -1 else engine_output_shape[2] # x1 y1 x2 y2 conf cls + ] + self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32) + self.d_output = cuda.mem_alloc(self.h_output.nbytes) + + self.stream = cuda.Stream() + + except Exception as e: + raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") + + @staticmethod + cdef get_gpu_memory_bytes(int device_id): + total_memory = None + try: + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + total_memory = mem_info.total + except pynvml.NVMLError: + total_memory = None + finally: + try: + pynvml.nvmlShutdown() + except pynvml.NVMLError: + pass + return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory # default 2 Gb + + @staticmethod + cdef get_engine_filename(int device_id): + try: + device = cuda.Device(device_id) + sm_count = device.multiprocessor_count + cc_major, cc_minor = device.compute_capability() + return f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}.engine" + except Exception: + return None + + @staticmethod + cdef convert_from_onnx(bytes onnx_model): + workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes(0) * 0.9) + + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + trt_logger = trt.Logger(trt.Logger.WARNING) + + with trt.Builder(trt_logger) as builder, \ + builder.create_network(explicit_batch_flag) as network, \ + trt.OnnxParser(network, trt_logger) as parser, \ + builder.create_builder_config() as config: + + config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes) + + if not parser.parse(onnx_model): + return None + + if builder.platform_has_fast_fp16: + constants.log('Converting to supported fp16') + config.set_flag(trt.BuilderFlag.FP16) + else: + print('Converting to supported fp32. (fp16 is not supported)') + plan = builder.build_serialized_network(network, config) + + if plan is None: + print('Conversion failed.') + return None + constants.log('conversion done!') + return bytes(plan) + + cdef tuple get_input_shape(self): + return self.input_shape[2], self.input_shape[3] + + cdef int get_batch_size(self): + return self.batch_size + + cdef run(self, input_data): + try: + cuda.memcpy_htod_async(self.d_input, input_data, self.stream) + self.context.set_tensor_address(self.input_name, int(self.d_input)) # input buffer + self.context.set_tensor_address(self.output_name, int(self.d_output)) # output buffer + + self.context.execute_async_v3(stream_handle=self.stream.handle) + self.stream.synchronize() + + # Fix: Remove the stream parameter from memcpy_dtoh + cuda.memcpy_dtoh(self.h_output, self.d_output) + output = self.h_output.reshape(self.output_shape) + return [output] + + except Exception as e: + raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}")