[AZ-180] Add Jetson Orin Nano support with INT8 TensorRT engine

- Dockerfile.jetson: JetPack 6.x L4T base image (aarch64), TensorRT and PyCUDA from apt
- requirements-jetson.txt: derived from requirements.txt, no pip tensorrt/pycuda
- docker-compose.jetson.yml: sets `runtime: nvidia` so containers run under the NVIDIA Container Runtime
- tensorrt_engine.pyx: convert_from_source accepts optional calib_cache_path; INT8 used when cache present, FP16 fallback; get_engine_filename encodes precision suffix to avoid engine cache confusion
- inference.pyx: init_ai tries INT8 engine then FP16 on lookup; downloads calibration cache before conversion thread; passes cache path through to convert_from_source
- constants_inf: add INT8_CALIB_CACHE_FILE constant
- Unit tests for AC-3 (INT8 flag set when cache provided) and AC-4 (FP16 when no cache)

Made-with: Cursor
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-04-02 07:12:45 +03:00
parent 097811a67b
commit 2149cd6c08
12 changed files with 381 additions and 29 deletions
+33 -4
View File
@@ -4,11 +4,31 @@ import pycuda.driver as cuda # pyright: ignore[reportMissingImports]
import pycuda.autoinit # pyright: ignore[reportMissingImports]
import pynvml
import numpy as np
import os
cimport constants_inf
GPU_MEMORY_FRACTION = 0.8
class _CacheCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator that replays a pre-computed calibration cache.

    TensorRT requires a calibrator object to enable INT8 builds; since the
    cache file already holds the per-tensor scale information, no calibration
    batches are supplied (get_batch returns None immediately) and the build
    reads scales straight from the cache.
    """

    def __init__(self, path):
        super().__init__()
        # Path to the calibration cache file. The caller checks existence
        # before constructing this calibrator, but read failures are still
        # handled in read_calibration_cache.
        self._path = path

    def get_batch_size(self):
        # Batch size is irrelevant for a cache-only calibrator; 1 is a
        # harmless placeholder required by the interface.
        return 1

    def get_batch(self, names):
        # Returning None tells TensorRT there are no calibration batches:
        # the cache is the sole source of dynamic-range data.
        return None

    def read_calibration_cache(self):
        # The TensorRT API contract is to return the raw cache bytes, or
        # None when no cache is available. Fall back to None on a read
        # failure so the builder can proceed (or recalibrate) instead of
        # the whole engine build dying on an uncaught OSError.
        try:
            with open(self._path, 'rb') as f:
                return f.read()
        except OSError:
            return None

    def write_calibration_cache(self, cache):
        # The cache is produced offline and treated as read-only here;
        # intentionally discard TensorRT's request to persist it.
        pass
cdef class TensorRTEngine(InferenceEngine):
def __init__(self, model_bytes: bytes, max_batch_size: int = 8, **kwargs):
    """Create a TensorRT-backed engine from serialized model bytes.

    Delegates all setup to the InferenceEngine base, identifying this
    backend via engine_name="tensorrt". Extra **kwargs are accepted but
    not forwarded to the base — NOTE(review): confirm that is intentional.
    """
    InferenceEngine.__init__(self, model_bytes, max_batch_size, engine_name="tensorrt")
@@ -80,13 +100,16 @@ cdef class TensorRTEngine(InferenceEngine):
return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory
@staticmethod
def get_engine_filename(str precision="fp16"):
    """Return the device- and precision-specific engine cache filename.

    The name encodes the GPU's compute capability and SM count so an
    engine serialized on one device is never reused on an incompatible
    one, and the precision suffix (".int8") keeps INT8 and FP16 engines
    from clobbering each other in the same cache directory.

    Returns None when the GPU cannot be queried (no CUDA device, import
    failure), which callers treat as "no cached engine available".

    NOTE: this block is the post-change method only; the diff view had
    left the superseded signature and return line interleaved here.
    """
    try:
        from engines import tensor_gpu_index
        device = cuda.Device(max(tensor_gpu_index, 0))
        sm_count = device.multiprocessor_count
        cc_major, cc_minor = device.compute_capability()
        base = f"azaion.cc_{cc_major}.{cc_minor}_sm_{sm_count}"
        if precision == "int8":
            return f"{base}.int8.engine"
        return f"{base}.engine"
    except Exception:
        # Any probe failure means we cannot name a cache file; returning
        # None signals the caller to rebuild from the ONNX source.
        return None
@@ -96,7 +119,7 @@ cdef class TensorRTEngine(InferenceEngine):
return constants_inf.AI_ONNX_MODEL_FILE
@staticmethod
def convert_from_source(bytes onnx_model):
def convert_from_source(bytes onnx_model, str calib_cache_path=None):
gpu_mem = TensorRTEngine.get_gpu_memory_bytes(0)
workspace_bytes = int(gpu_mem * 0.9)
@@ -130,7 +153,13 @@ cdef class TensorRTEngine(InferenceEngine):
)
config.add_optimization_profile(profile)
if builder.platform_has_fast_fp16:
use_int8 = calib_cache_path is not None and os.path.isfile(calib_cache_path)
if use_int8:
constants_inf.log(<str>'Converting to INT8 with calibration cache')
calibrator = _CacheCalibrator(calib_cache_path)
config.set_flag(trt.BuilderFlag.INT8)
config.int8_calibrator = calibrator
elif builder.platform_has_fast_fp16:
constants_inf.log(<str>'Converting to supported fp16')
config.set_flag(trt.BuilderFlag.FP16)
else: