import re
import struct
import subprocess
from pathlib import Path
from typing import List, Tuple
import json

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

from inference.onnx_engine import InferenceEngine

# Required to automatically initialize CUDA; do not remove.
import pycuda.autoinit
import pynvml


class TensorRTEngine(InferenceEngine):
    """Inference engine backed by a serialized TensorRT plan.

    Deserializes an engine from bytes, owns the CUDA buffers/stream, and
    exposes single-input/single-output synchronous inference via run().
    """

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    def __init__(self, model_bytes: bytes, **kwargs):
        # NOTE(review): the original body of __init__ (and the `def` line of
        # get_input_shape) was lost in the source -- everything between
        # `struct.unpack("` and `) -> Tuple[int, int]` is missing, most
        # likely because literal '<' characters (e.g. a "<I" struct format)
        # were stripped by an export tool. The implementation below is a
        # reconstruction inferred from the surviving fragment (a commented
        # `metadata_len = struct.unpack(...)` line) and from the attributes
        # that run()/get_input_shape()/get_batch_size() read. Confirm it
        # against version control before trusting it.
        try:
            # Presumed payload layout: [4-byte little-endian metadata length]
            # [JSON metadata][serialized engine plan] -- TODO confirm.
            (metadata_len,) = struct.unpack_from("<I", model_bytes, 0)
            self.metadata = json.loads(
                model_bytes[4:4 + metadata_len].decode("utf-8"))
            engine_bytes = model_bytes[4 + metadata_len:]

            runtime = trt.Runtime(self.TRT_LOGGER)
            self.engine = runtime.deserialize_cuda_engine(engine_bytes)
            self.context = self.engine.create_execution_context()

            # run() assumes exactly one input and one output tensor.
            self.input_name = self.engine.get_tensor_name(0)
            self.output_name = self.engine.get_tensor_name(1)
            self.input_shape = tuple(
                self.engine.get_tensor_shape(self.input_name))
            self.output_shape = tuple(
                self.engine.get_tensor_shape(self.output_name))
            self.batch_size = self.input_shape[0]

            # Host/device buffers and stream used by run(). float32 is
            # assumed -- TODO confirm against the engine's tensor dtypes.
            itemsize = np.dtype(np.float32).itemsize
            self.h_output = cuda.pagelocked_empty(
                int(np.prod(self.output_shape)), dtype=np.float32)
            self.d_input = cuda.mem_alloc(
                int(np.prod(self.input_shape)) * itemsize)
            self.d_output = cuda.mem_alloc(self.h_output.nbytes)
            self.stream = cuda.Stream()
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise RuntimeError(
                f"Failed to initialize TensorRT engine: {str(e)}") from e

    def get_input_shape(self) -> Tuple[int, int]:
        """Return (height, width) of the NCHW input tensor.

        NOTE(review): the `def` line of this method was garbled in the
        source; the signature is reconstructed from the surviving
        `-> Tuple[int, int]` fragment. The body survived intact.
        """
        return self.input_shape[2], self.input_shape[3]

    def get_batch_size(self) -> int:
        """Return the engine's batch dimension (input_shape[0])."""
        return self.batch_size

    @staticmethod
    def get_gpu_memory_bytes(device_id=0) -> int:
        """Total memory of GPU `device_id` in bytes, queried via NVML.

        Falls back to 2 GiB when NVML is unavailable or errors out.
        """
        total_memory = None
        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total
        except pynvml.NVMLError:
            total_memory = None
        finally:
            # Best-effort shutdown: nvmlInit itself may have failed.
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError:
                pass
        return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory  # default 2 GiB

    @staticmethod
    def get_engine_filename(device_id=0) -> str | None:
        """Build a per-GPU engine filename keyed on compute capability and SM count.

        Returns None when the CUDA device cannot be queried; callers
        treat None as "no cached engine available".
        """
        try:
            device = cuda.Device(device_id)
            sm_count = device.multiprocessor_count
            cc_major, cc_minor = device.compute_capability()
            return f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}.engine"
        except Exception:
            return None

    @staticmethod
    def convert_from_onnx(onnx_model: bytes) -> bytes | None:
        """Build a serialized TensorRT engine from ONNX model bytes.

        Uses ~90% of the GPU's total memory as the builder workspace and
        enables FP16 when the platform supports it. Returns None on parse
        or build failure.
        """
        workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes() * 0.9)
        explicit_batch_flag = 1 << int(
            trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TensorRTEngine.TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch_flag) as network, \
                trt.OnnxParser(network, TensorRTEngine.TRT_LOGGER) as parser, \
                builder.create_builder_config() as config:
            config.set_memory_pool_limit(
                trt.MemoryPoolType.WORKSPACE, workspace_bytes)
            if not parser.parse(onnx_model):
                # Fix: surface parser diagnostics instead of failing silently.
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
            if builder.platform_has_fast_fp16:
                print('Converting to supported fp16')
                config.set_flag(trt.BuilderFlag.FP16)
            else:
                print('Converting to supported fp32. (fp16 is not supported)')
            plan = builder.build_serialized_network(network, config)
            if plan is None:
                print('Conversion failed.')
                return None
            return bytes(plan)

    def run(self, input_data: np.ndarray) -> List[np.ndarray]:
        """Run one synchronous inference pass.

        Copies `input_data` to the device, executes the engine, and returns
        a one-element list holding the output reshaped to `output_shape`.
        Raises RuntimeError (chained to the cause) on any failure.
        """
        try:
            cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
            self.context.set_tensor_address(self.input_name, int(self.d_input))    # input buffer
            self.context.set_tensor_address(self.output_name, int(self.d_output))  # output buffer
            self.context.execute_async_v3(stream_handle=self.stream.handle)
            self.stream.synchronize()
            # Synchronous copy is correct here: the stream was just synchronized.
            cuda.memcpy_dtoh(self.h_output, self.d_output)
            output = self.h_output.reshape(self.output_shape)
            return [output]
        except Exception as e:
            # Fix: chain the cause so the original traceback is preserved.
            raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}") from e