diff --git a/.woodpecker/e2e-convert-jetson.yml b/.woodpecker/e2e-convert-jetson.yml new file mode 100644 index 0000000..9d4f9c3 --- /dev/null +++ b/.woodpecker/e2e-convert-jetson.yml @@ -0,0 +1,30 @@ +when: + - event: [manual] + evaluate: 'E2E_CONVERT_JETSON == "1"' + +labels: + platform: arm64 + +steps: + - name: e2e-convert-jetson + image: docker + environment: + REGISTRY_HOST: + from_secret: registry_host + REGISTRY_USER: + from_secret: registry_user + REGISTRY_TOKEN: + from_secret: registry_token + commands: + - apk add --no-cache bash docker-cli-compose + - echo "$REGISTRY_TOKEN" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin + - cd e2e + - > + E2E_PROFILE=jetson + E2E_WAIT_FOR_ENGINE_ENABLED=1 + E2E_ENGINE_WAIT_TIMEOUT=3600 + E2E_LOG_TAIL=300 + bash run_test.sh tests/test_health_engine.py::TestHealthEngineStep03Warmed + - bash scripts/publish_jetson_engine.sh + volumes: + - /var/run/docker.sock:/var/run/docker.sock diff --git a/.woodpecker/e2e-smoke-jetson.yml b/.woodpecker/e2e-smoke-jetson.yml new file mode 100644 index 0000000..d3adc1d --- /dev/null +++ b/.woodpecker/e2e-smoke-jetson.yml @@ -0,0 +1,25 @@ +when: + - event: [manual] + evaluate: 'E2E_CONVERT_JETSON != "1"' + +labels: + platform: arm64 + +steps: + - name: e2e-smoke-jetson + image: docker + environment: + REGISTRY_HOST: + from_secret: registry_host + REGISTRY_USER: + from_secret: registry_user + REGISTRY_TOKEN: + from_secret: registry_token + commands: + - apk add --no-cache bash docker-cli-compose + - echo "$REGISTRY_TOKEN" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin + - cd e2e + - bash scripts/pull_jetson_engine.sh + - E2E_PROFILE=jetson bash run_test.sh tests/test_health_engine.py + volumes: + - /var/run/docker.sock:/var/run/docker.sock diff --git a/Dockerfile.jetson b/Dockerfile.jetson index 8da6048..08261b0 100644 --- a/Dockerfile.jetson +++ b/Dockerfile.jetson @@ -13,7 +13,7 @@ WORKDIR /app COPY requirements.txt requirements-jetson.txt ./ RUN 
pip3 install --no-cache-dir -r requirements-jetson.txt COPY . . -RUN python3 setup.py build_ext --inplace +RUN BUILD_TENSORRT_EXTENSIONS=1 python3 setup.py build_ext --inplace ENV PYTHONPATH=/app/src RUN adduser --disabled-password --no-create-home --gecos "" appuser \ && chown -R appuser /app diff --git a/e2e/docker-compose.test.yml b/e2e/docker-compose.test.yml index 8885f0c..73f6c98 100644 --- a/e2e/docker-compose.test.yml +++ b/e2e/docker-compose.test.yml @@ -2,9 +2,9 @@ name: detections-e2e services: mock-loader: - build: ./mocks/loader - volumes: - - ./fixtures:/models + build: + context: . + dockerfile: mocks/loader/Dockerfile networks: - e2e-net @@ -74,9 +74,7 @@ services: JWT_SECRET: test-secret-e2e-only CLASSES_JSON_PATH: /app/classes.json volumes: - - ./fixtures/classes.json:/app/classes.json:ro - ./fixtures:/media:ro - - ./logs:/app/Logs shm_size: 512m networks: e2e-net: @@ -94,6 +92,7 @@ services: - mock-annotations environment: JWT_SECRET: test-secret-e2e-only + MEDIA_DIR: /app/fixtures volumes: - ./fixtures:/media - ./results:/results diff --git a/e2e/engine-artifact.Dockerfile b/e2e/engine-artifact.Dockerfile new file mode 100644 index 0000000..1d43e4c --- /dev/null +++ b/e2e/engine-artifact.Dockerfile @@ -0,0 +1,5 @@ +FROM alpine:3.20 + +COPY . /models/ + +CMD ["sh"] diff --git a/e2e/fixtures/image_small.jpg b/e2e/fixtures/image_small.jpg new file mode 100644 index 0000000..a693427 Binary files /dev/null and b/e2e/fixtures/image_small.jpg differ diff --git a/e2e/fixtures/models/azaion.onnx b/e2e/fixtures/models/azaion.onnx new file mode 100644 index 0000000..846b6ba Binary files /dev/null and b/e2e/fixtures/models/azaion.onnx differ diff --git a/e2e/mocks/loader/Dockerfile b/e2e/mocks/loader/Dockerfile index 1e4aea5..b60e0e6 100644 --- a/e2e/mocks/loader/Dockerfile +++ b/e2e/mocks/loader/Dockerfile @@ -1,6 +1,7 @@ FROM python:3.11-slim WORKDIR /app RUN pip install --no-cache-dir flask gunicorn -COPY app.py . +COPY mocks/loader/app.py . 
def _write_disk_path(filename: str, folder: str | None, data: bytes) -> Path:
    """Persist an uploaded resource below the models root and return its path.

    Args:
        filename: Name supplied by the client; only its final path component
            is used.
        folder: Optional sub-directory (relative to the models root) to write
            into. Created if missing.
        data: Raw bytes to store.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If ``filename`` has no usable base name, or ``folder``
            resolves to a location outside the models root.
    """
    root = _models_root().resolve()

    # Path("..").name is ".." and Path(".").name is "", so taking .name alone
    # does not guarantee a plain file name.
    safe_filename = Path(filename).name
    if safe_filename in ("", ".", ".."):
        raise ValueError(f"invalid upload filename: {filename!r}")

    # Joining an absolute or "../"-laden folder would silently leave the
    # models root, so resolve first and verify containment before touching
    # the filesystem.
    target_dir = (root / folder).resolve() if folder else root
    if not target_dir.is_relative_to(root):
        raise ValueError(f"upload folder escapes models root: {folder!r}")

    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / safe_filename
    target.write_bytes(data)
    return target
#!/usr/bin/env bash
# Copy the TensorRT engine(s) that the e2e run left in the mock-loader
# container, wrap them in a small artifact image and push that image to the
# registry (tagged by branch, and additionally by short commit SHA when
# CI_COMMIT_SHA is set).
#
# Environment:
#   REGISTRY_HOST             required; registry to push to
#   COMPOSE                   compose command line (default: jetson profile)
#   JETSON_ENGINE_REPOSITORY  image repository override
#   JETSON_ENGINE_TAG         image tag override (default: sanitized branch)
#   JETSON_ENGINE_OUT_DIR     local staging directory
set -euo pipefail

COMPOSE="${COMPOSE:-docker compose -f docker-compose.test.yml --profile jetson}"
REGISTRY_HOST="${REGISTRY_HOST:?REGISTRY_HOST is required}"
ENGINE_REPOSITORY="${JETSON_ENGINE_REPOSITORY:-$REGISTRY_HOST/azaion/detections-jetson-engine}"
BRANCH="${CI_COMMIT_BRANCH:-local}"
# Replace every character that is not valid in an image tag with '-'.
ENGINE_TAG="${JETSON_ENGINE_TAG:-$(printf '%s' "$BRANCH" | tr -c 'A-Za-z0-9_.-' '-')}"
OUT_DIR="${JETSON_ENGINE_OUT_DIR:-results/jetson-engine}"

mkdir -p "$OUT_DIR/models"
# The staging directory survives between runs. Remove engines from an earlier
# run so a failed conversion cannot be masked by (and re-published as) a
# stale file.
find "$OUT_DIR/models" -maxdepth 1 -type f -name 'azaion*.engine' -delete

# $COMPOSE is deliberately unquoted: it holds a command plus its arguments.
loader_id="$($COMPOSE ps -q mock-loader)"
if [[ -z "$loader_id" ]]; then
  echo "ERROR: mock-loader container is not running" >&2
  exit 1
fi

docker cp "$loader_id:/models/models/." "$OUT_DIR/models/"
# Keep only the engine files; everything else copied from the container
# (ONNX source, other fixtures) must not end up in the artifact image.
find "$OUT_DIR/models" -maxdepth 1 -type f ! -name 'azaion*.engine' -delete

engine_count="$(find "$OUT_DIR/models" -maxdepth 1 -type f -name 'azaion*.engine' | wc -l | tr -d ' ')"
if [[ "$engine_count" == "0" ]]; then
  echo "ERROR: no converted TensorRT engine found in mock-loader /models/models" >&2
  find "$OUT_DIR/models" -maxdepth 2 -type f -print >&2
  exit 1
fi

echo "--- Converted TensorRT engine files:"
find "$OUT_DIR/models" -maxdepth 1 -type f -name 'azaion*.engine' -print -exec ls -lh {} \;

image="$ENGINE_REPOSITORY:$ENGINE_TAG"
echo "--- Building Jetson engine artifact image: $image"
docker build -f engine-artifact.Dockerfile -t "$image" "$OUT_DIR/models"
docker push "$image"

if [[ -n "${CI_COMMIT_SHA:-}" ]]; then
  sha_tag="$(printf '%s' "$CI_COMMIT_SHA" | cut -c1-12)"
  docker tag "$image" "$ENGINE_REPOSITORY:$sha_tag"
  docker push "$ENGINE_REPOSITORY:$sha_tag"
fi

echo "--- Published Jetson engine artifact image: $image"
def _wait_for_engine_enabled(http_client):
    """Poll the health endpoint until ``aiAvailability`` becomes ``Enabled``.

    The health payload is fetched via ``_get_health`` roughly every three
    seconds for up to ``_ENGINE_WAIT_TIMEOUT`` seconds.

    Returns:
        The health payload observed when the engine reported ``Enabled``.

    Fails the test (``pytest.fail``) when the service reports ``Error`` or
    the timeout elapses; asserts that an ``Enabled`` payload carries no
    ``errorMessage``.
    """
    # Monotonic clock: a wall-clock adjustment during a long conversion must
    # neither shorten nor extend the wait.
    deadline = time.monotonic() + _ENGINE_WAIT_TIMEOUT
    while True:
        # Poll before checking the deadline so the endpoint is queried at
        # least once even with a zero/negative timeout.
        last = _get_health(http_client)
        availability = last.get("aiAvailability")
        if availability == "Enabled":
            assert last.get("errorMessage") is None
            return last
        if availability == "Error":
            pytest.fail(f"engine conversion failed: {last.get('errorMessage')}")
        if time.monotonic() >= deadline:
            break
        time.sleep(3)
    pytest.fail(
        f"engine did not become Enabled within {_ENGINE_WAIT_TIMEOUT}s "
        f"(last health: {last})"
    )
a/requirements-jetson.txt +++ b/requirements-jetson.txt @@ -5,7 +5,8 @@ h11==0.16.0 python-multipart==0.0.22 Cython==3.2.4 opencv-python==4.10.0.84 -numpy==2.2.6 +numpy==1.26.4 +onnx==1.17.0 pynvml==12.0.0 requests==2.32.4 loguru==0.7.3 diff --git a/setup.py b/setup.py index 5dcb156..b00d2be 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ from setuptools import setup, Extension from Cython.Build import cythonize import numpy as np +import os SRC = "src" np_inc = [np.get_include(), SRC] @@ -18,16 +19,22 @@ extensions = [ Extension('inference', [f'{SRC}/inference.pyx'], include_dirs=np_inc), ] -try: - import tensorrt # pyright: ignore[reportMissingImports] +build_tensorrt = os.environ.get("BUILD_TENSORRT_EXTENSIONS", "").lower() in ("1", "true", "yes") + +if not build_tensorrt: + try: + import tensorrt # pyright: ignore[reportMissingImports] + build_tensorrt = True + except ImportError: + build_tensorrt = False + +if build_tensorrt: extensions.append( Extension('engines.tensorrt_engine', [f'{SRC}/engines/tensorrt_engine.pyx'], include_dirs=np_inc) ) extensions.append( Extension('engines.jetson_tensorrt_engine', [f'{SRC}/engines/jetson_tensorrt_engine.pyx'], include_dirs=np_inc) ) -except ImportError: - pass setup( name="azaion.detections", diff --git a/src/constants_inf.pxd b/src/constants_inf.pxd index fa3d36d..955da4a 100644 --- a/src/constants_inf.pxd +++ b/src/constants_inf.pxd @@ -12,8 +12,8 @@ cdef str SPLIT_SUFFIX cdef double TILE_DUPLICATE_CONFIDENCE_THRESHOLD cdef int METERS_IN_TILE -cdef log(str log_message) -cdef logerror(str error) +cpdef log(str log_message) +cpdef logerror(str error) cdef format_time(long ms) cdef dict[int, AnnotationClass] annotations_dict diff --git a/src/constants_inf.pyx b/src/constants_inf.pyx index b86baca..412721a 100644 --- a/src/constants_inf.pyx +++ b/src/constants_inf.pyx @@ -78,10 +78,10 @@ def get_annotation_name(int cls_id): return (annotations_dict[cls_id]).name return "" -cdef log(str log_message): +cpdef log(str 
def load_engine(self, LoaderHttpClient loader_client, str models_dir):
    """Try to load a previously converted engine, preferring INT8 over FP16.

    For each precision the expected engine file name is resolved with
    ``TensorRTEngine.get_engine_filename`` and requested from the loader.
    The first file that loads and deserializes successfully is returned.

    Returns:
        The engine built by ``self.create`` from the downloaded bytes, or
        ``None`` when no cached engine could be used for either precision.
    """
    cdef str filename
    cdef LoadResult res
    from engines.tensorrt_engine import TensorRTEngine
    for precision in ("int8", "fp16"):
        filename = TensorRTEngine.get_engine_filename(precision)
        if filename is None:
            continue
        try:
            res = loader_client.load_big_small_resource(filename, models_dir)
            if res.err is None:
                return self.create(res.data)
            constants_inf.log(f"Cached {precision} engine {filename} not available: {res.err}")
        except Exception as e:
            # Still best-effort (fall through to the next precision), but
            # record why this candidate was rejected instead of hiding it.
            constants_inf.log(f"Failed to load cached {precision} engine {filename}: {str(e)}")
    return None
engines.tensorrt_engine import TensorRTEngine - engine_bytes = JetsonTensorRTEngine.convert_from_source(onnx_bytes, loader_client, models_dir) - return engine_bytes, TensorRTEngine.get_engine_filename("int8") + engine_bytes, precision = JetsonTensorRTEngine.convert_from_source_with_precision(onnx_bytes, loader_client, models_dir) + return engine_bytes, TensorRTEngine.get_engine_filename(precision) diff --git a/src/engines/jetson_tensorrt_engine.pyx b/src/engines/jetson_tensorrt_engine.pyx index cc9fc56..a554d12 100644 --- a/src/engines/jetson_tensorrt_engine.pyx +++ b/src/engines/jetson_tensorrt_engine.pyx @@ -1,5 +1,6 @@ import os import tempfile +cimport constants_inf from engines.tensorrt_engine cimport TensorRTEngine from loader_http_client cimport LoaderHttpClient, LoadResult @@ -7,10 +8,19 @@ from loader_http_client cimport LoaderHttpClient, LoadResult cdef class JetsonTensorRTEngine(TensorRTEngine): @staticmethod def convert_from_source(bytes onnx_model, LoaderHttpClient loader_client, str models_dir): + engine_bytes, precision = JetsonTensorRTEngine.convert_from_source_with_precision( + onnx_model, loader_client, models_dir + ) + return engine_bytes + + @staticmethod + def convert_from_source_with_precision(bytes onnx_model, LoaderHttpClient loader_client, str models_dir): cdef str calib_cache_path calib_cache_path = JetsonTensorRTEngine._download_calib_cache(loader_client, models_dir) try: - return TensorRTEngine.convert_from_source(onnx_model, calib_cache_path) + engine_bytes = TensorRTEngine.convert_from_source(onnx_model, calib_cache_path, True) + precision = "int8" if calib_cache_path is not None else "fp16" + return engine_bytes, precision finally: if calib_cache_path is not None: try: @@ -21,7 +31,6 @@ cdef class JetsonTensorRTEngine(TensorRTEngine): @staticmethod def _download_calib_cache(LoaderHttpClient loader_client, str models_dir): cdef LoadResult res - import constants_inf try: res = loader_client.load_big_small_resource( 
"""Rewrite an ONNX model so that it can be parsed by TensorRT.

The entry point is :func:`prepare_for_tensorrt`; the remaining helpers are
private. ``onnx`` is imported where it is needed rather than at module level
so the pure helpers stay importable in environments that do not ship it.
"""
import ast
import io


# Reduce operators whose ``axes`` may arrive as a second input tensor.
_REDUCE_OPS_WITH_AXES_INPUT = {
    "ReduceL1",
    "ReduceL2",
    "ReduceLogSum",
    "ReduceLogSumExp",
    "ReduceMax",
    "ReduceMean",
    "ReduceMin",
    "ReduceProd",
    "ReduceSum",
    "ReduceSumSquare",
}

# (height, width) used when the model metadata carries no usable ``imgsz``.
_DEFAULT_INPUT_SIZE = (1280, 1280)


def _metadata(model):
    """Return the model's ``metadata_props`` as a plain ``{key: value}`` dict."""
    return {p.key: p.value for p in model.metadata_props}


def _input_size(model):
    """Return ``(height, width)`` parsed from the ``imgsz`` metadata entry.

    ``imgsz`` is expected to be the literal of a two-element list/tuple of
    positive numbers (for example ``"[640, 640]"``). Anything else, including
    a missing entry, yields ``_DEFAULT_INPUT_SIZE``.
    """
    imgsz = _metadata(model).get("imgsz")
    if not imgsz:
        return _DEFAULT_INPUT_SIZE
    try:
        parsed = ast.literal_eval(imgsz)
        if isinstance(parsed, (list, tuple)) and len(parsed) == 2:
            h, w = int(parsed[0]), int(parsed[1])
            if h > 0 and w > 0:
                return h, w
    except (ValueError, TypeError, SyntaxError, OverflowError,
            MemoryError, RecursionError):
        # Malformed metadata is not fatal: fall back to the default size.
        pass
    return _DEFAULT_INPUT_SIZE


def _constant_values(graph, names=None):
    """Map tensor names to numpy arrays for initializers and Constant nodes.

    Args:
        graph: ONNX graph to scan.
        names: Optional iterable of tensor names. When given, only those
            tensors are converted, which avoids materialising every weight
            of the model as a numpy array.

    Returns:
        ``{name: ndarray}``. A ``Constant`` node output overrides an
        initializer of the same name.
    """
    from onnx import numpy_helper

    wanted = None if names is None else set(names)
    values = {}
    for init in graph.initializer:
        if wanted is None or init.name in wanted:
            values[init.name] = numpy_helper.to_array(init)
    for node in graph.node:
        if node.op_type != "Constant" or not node.output:
            continue
        output_name = node.output[0]
        if wanted is not None and output_name not in wanted:
            continue
        for attr in node.attribute:
            if attr.name == "value":
                values[output_name] = numpy_helper.to_array(attr.t)
                break
    return values


def _as_int_list(value):
    """Flatten a scalar or array-like into a list of ints (``None`` stays ``None``)."""
    if value is None:
        return None
    if getattr(value, "shape", ()) == ():
        return [int(value)]
    return [int(v) for v in value.reshape(-1).tolist()]


def _set_static_input_shape(model, batch=1):
    """Pin the first 4-D float graph input to ``(batch, 3, H, W)``.

    ``H`` and ``W`` come from :func:`_input_size`.

    Returns:
        ``True`` if such an input was found (and rewritten), else ``False``.
    """
    import onnx

    h, w = _input_size(model)
    for graph_input in model.graph.input:
        tensor_type = graph_input.type.tensor_type
        if tensor_type.elem_type != onnx.TensorProto.FLOAT:
            continue
        dims = tensor_type.shape.dim
        if len(dims) != 4:
            continue
        for dim, value in zip(dims, (batch, 3, h, w)):
            dim.dim_value = value
        return True
    return False


def _rewrite_reduce_axes_inputs(model):
    """Turn constant ``axes`` inputs of Reduce* nodes into ``axes`` attributes.

    Only nodes whose second input resolves to a non-empty constant are
    rewritten; every input after the first is dropped from those nodes.

    Returns:
        ``True`` if at least one node was modified.
    """
    reduce_nodes = [
        node for node in model.graph.node
        if node.op_type in _REDUCE_OPS_WITH_AXES_INPUT and len(node.input) >= 2
    ]
    if not reduce_nodes:
        return False

    from onnx import helper

    constants = _constant_values(
        model.graph, {node.input[1] for node in reduce_nodes}
    )
    changed = False
    for node in reduce_nodes:
        axes = _as_int_list(constants.get(node.input[1]))
        # Skip unknown axes, and empty ones too: an empty list cannot be
        # turned into a typed attribute (presumably make_attribute rejects
        # it - TODO confirm) and one such node should not abort the whole
        # preparation.
        if not axes:
            continue
        kept_attrs = [attr for attr in node.attribute if attr.name != "axes"]
        del node.attribute[:]
        node.attribute.extend(kept_attrs)
        node.attribute.extend([helper.make_attribute("axes", axes)])
        del node.input[1:]
        changed = True
    return changed


def _cap_default_opset(model, max_opset=17):
    """Lower the default-domain opset version to ``max_opset`` if it is higher.

    Only the declared version number is changed; nodes are not converted.

    Returns:
        ``True`` if the version was lowered.
    """
    for opset in model.opset_import:
        if opset.domain in ("", "ai.onnx") and opset.version > max_opset:
            opset.version = max_opset
            return True
    return False


def prepare_for_tensorrt(model_bytes):
    """Return ONNX bytes adjusted for a static-shape TensorRT build.

    Applies, in order: static input shape, Reduce* axes rewrite, default
    opset cap. When none of them changes the model the original
    ``model_bytes`` object is returned untouched.
    """
    import onnx

    model = onnx.load_model_from_string(model_bytes)
    changed = False
    changed = _set_static_input_shape(model) or changed
    changed = _rewrite_reduce_axes_inputs(model) or changed
    changed = _cap_default_opset(model) or changed
    if not changed:
        return model_bytes

    buffer = io.BytesIO()
    onnx.save_model(model, buffer)
    return buffer.getvalue()
def __init__(self, model_bytes: bytes, max_batch_size: int = 8, **kwargs):
    """Deserialize a TensorRT engine and allocate its I/O buffers.

    A dedicated CUDA context is created for this engine and is pushed only
    while CUDA work is performed; ``cuda_lock`` serialises that work.

    Args:
        model_bytes: Serialized TensorRT engine.
        max_batch_size: Initial value handed to ``InferenceEngine``. It is
            replaced by the engine's static batch dimension, or - for a
            dynamic batch dimension - by a value derived from GPU memory.
        **kwargs: Accepted for signature compatibility; unused here.

    Raises:
        RuntimeError: If the CUDA context cannot be created or the engine
            cannot be deserialized / initialised. The original exception is
            attached as ``__cause__``.
    """
    InferenceEngine.__init__(self, model_bytes, max_batch_size, engine_name="tensorrt")
    self.cuda_lock = threading.Lock()
    try:
        # Created inside the wrapper so a CUDA start-up failure is reported
        # the same way as any other initialisation error.
        self.cuda_context = TensorRTEngine.create_cuda_context()
        with self.cuda_lock:
            self.cuda_context.push()
            try:
                logger = trt.Logger(trt.Logger.WARNING)
                runtime = trt.Runtime(logger)
                engine = runtime.deserialize_cuda_engine(model_bytes)
                if engine is None:
                    raise RuntimeError("Failed to load TensorRT engine from bytes")

                self.context = engine.create_execution_context()

                self.input_name = engine.get_tensor_name(0)
                engine_input_shape = engine.get_tensor_shape(self.input_name)

                # -1 marks a dynamic dimension; fall back to 1280 for H/W.
                C = engine_input_shape[1]
                H = 1280 if engine_input_shape[2] == -1 else engine_input_shape[2]
                W = 1280 if engine_input_shape[3] == -1 else engine_input_shape[3]

                if engine_input_shape[0] == -1:
                    gpu_mem = TensorRTEngine.get_gpu_memory_bytes(0)
                    self.max_batch_size = TensorRTEngine.calculate_max_batch_size(gpu_mem, H, W)
                else:
                    self.max_batch_size = engine_input_shape[0]

                self.input_shape = [self.max_batch_size, C, H, W]
                self.context.set_input_shape(self.input_name, self.input_shape)
                input_size = trt.volume(self.input_shape) * np.dtype(np.float32).itemsize
                self.d_input = cuda.mem_alloc(input_size)

                self.output_name = engine.get_tensor_name(1)
                engine_output_shape = tuple(engine.get_tensor_shape(self.output_name))
                # Dynamic output dims default to 300 rows x 6 values.
                self.output_shape = [
                    self.max_batch_size,
                    300 if engine_output_shape[1] == -1 else engine_output_shape[1],
                    6 if engine_output_shape[2] == -1 else engine_output_shape[2],
                ]
                self.h_output = cuda.pagelocked_empty(tuple(self.output_shape), dtype=np.float32)
                self.d_output = cuda.mem_alloc(self.h_output.nbytes)
                self.stream = cuda.Stream()
            finally:
                # Never let a failing pop replace the real error, but do
                # leave a trace of it.
                try:
                    self.cuda_context.pop()
                except Exception as pop_error:
                    constants_inf.logerror(f'Failed to pop CUDA context: {str(pop_error)}')
    except Exception as e:
        raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}") from e
cuda.init() + from engines import tensor_gpu_index + ctx = cuda.Device(max(tensor_gpu_index, 0)).make_context() + ctx.pop() + return ctx + @staticmethod def get_engine_filename(str precision="fp16"): try: + cuda.init() from engines import tensor_gpu_index device = cuda.Device(max(tensor_gpu_index, 0)) sm_count = device.multiprocessor_count @@ -114,82 +139,114 @@ cdef class TensorRTEngine(InferenceEngine): return None @staticmethod - def convert_from_source(bytes onnx_model, str calib_cache_path=None): + def convert_from_source(bytes onnx_model, str calib_cache_path=None, bint force_static_input=False): + cuda_context = TensorRTEngine.create_cuda_context() + cuda_context.push() gpu_mem = TensorRTEngine.get_gpu_memory_bytes(0) workspace_bytes = int(gpu_mem * 0.9) explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) trt_logger = trt.Logger(trt.Logger.WARNING) - with trt.Builder(trt_logger) as builder, \ - builder.create_network(explicit_batch_flag) as network, \ - trt.OnnxParser(network, trt_logger) as parser, \ - builder.create_builder_config() as config: + if force_static_input: + try: + from engines.onnx_tensorrt_compat import prepare_for_tensorrt + onnx_model = prepare_for_tensorrt(onnx_model) + constants_inf.log('Prepared ONNX model for TensorRT static Jetson build') + except Exception as e: + constants_inf.logerror(f'ONNX TensorRT compatibility preparation failed: {str(e)}') - config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes) + try: + with trt.Builder(trt_logger) as builder, \ + builder.create_network(explicit_batch_flag) as network, \ + trt.OnnxParser(network, trt_logger) as parser, \ + builder.create_builder_config() as config: - if not parser.parse(onnx_model): - return None + config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes) - input_tensor = network.get_input(0) - shape = input_tensor.shape - C = shape[1] - H = max(shape[2], 1280) if shape[2] != -1 else 1280 - W = max(shape[3], 
1280) if shape[3] != -1 else 1280 + if not parser.parse(onnx_model): + for i in range(parser.num_errors): + constants_inf.logerror(f'TensorRT ONNX parser error: {parser.get_error(i)}') + return None - if shape[0] == -1: - max_batch = TensorRTEngine.calculate_max_batch_size(gpu_mem, H, W) - profile = builder.create_optimization_profile() - profile.set_shape( - input_tensor.name, - (1, C, H, W), - (max_batch, C, H, W), - (max_batch, C, H, W), - ) - config.add_optimization_profile(profile) + input_tensor = network.get_input(0) + shape = input_tensor.shape + C = shape[1] + H = max(shape[2], 1280) if shape[2] != -1 else 1280 + W = max(shape[3], 1280) if shape[3] != -1 else 1280 - use_int8 = calib_cache_path is not None and os.path.isfile(calib_cache_path) - if use_int8: - constants_inf.log('Converting to INT8 with calibration cache') - calibrator = _CacheCalibrator(calib_cache_path) - config.set_flag(trt.BuilderFlag.INT8) - if builder.platform_has_fast_fp16: + if force_static_input: + input_tensor.shape = (1, C, H, W) + elif shape[0] == -1 or shape[2] == -1 or shape[3] == -1: + max_batch = TensorRTEngine.calculate_max_batch_size(gpu_mem, H, W) + profile = builder.create_optimization_profile() + profile.set_shape( + input_tensor.name, + (1, C, H, W), + (max_batch, C, H, W), + (max_batch, C, H, W), + ) + config.add_optimization_profile(profile) + + use_int8 = calib_cache_path is not None and os.path.isfile(calib_cache_path) + if use_int8: + constants_inf.log('Converting to INT8 with calibration cache') + calibrator = _CacheCalibrator(calib_cache_path) + config.set_flag(trt.BuilderFlag.INT8) + if builder.platform_has_fast_fp16: + config.set_flag(trt.BuilderFlag.FP16) + config.int8_calibrator = calibrator + elif builder.platform_has_fast_fp16: + constants_inf.log('Converting to supported fp16') config.set_flag(trt.BuilderFlag.FP16) - config.int8_calibrator = calibrator - elif builder.platform_has_fast_fp16: - constants_inf.log('Converting to supported fp16') - 
config.set_flag(trt.BuilderFlag.FP16) - else: - constants_inf.log('Converting to supported fp32. (fp16 is not supported)') + else: + constants_inf.log('Converting to supported fp32. (fp16 is not supported)') - plan = builder.build_serialized_network(network, config) - if plan is None: - constants_inf.logerror('Conversion failed.') - return None - constants_inf.log('conversion done!') - return bytes(plan) + plan = builder.build_serialized_network(network, config) + if plan is None: + constants_inf.logerror('Conversion failed.') + return None + constants_inf.log('conversion done!') + return bytes(plan) + finally: + try: + cuda_context.pop() + except Exception: + pass + try: + cuda_context.detach() + except Exception: + pass cdef tuple get_input_shape(self): return (self.input_shape[2], self.input_shape[3]) cdef run(self, input_data): try: - actual_batch = input_data.shape[0] - if actual_batch != self.input_shape[0]: - actual_shape = [actual_batch, self.input_shape[1], self.input_shape[2], self.input_shape[3]] - self.context.set_input_shape(self.input_name, actual_shape) + with self.cuda_lock: + self.cuda_context.push() + try: + actual_batch = input_data.shape[0] + if actual_batch != self.input_shape[0]: + actual_shape = [actual_batch, self.input_shape[1], self.input_shape[2], self.input_shape[3]] + self.context.set_input_shape(self.input_name, actual_shape) - cuda.memcpy_htod_async(self.d_input, input_data, self.stream) - self.context.set_tensor_address(self.input_name, int(self.d_input)) - self.context.set_tensor_address(self.output_name, int(self.d_output)) + cuda.memcpy_htod_async(self.d_input, input_data, self.stream) + self.context.set_tensor_address(self.input_name, int(self.d_input)) + self.context.set_tensor_address(self.output_name, int(self.d_output)) - self.context.execute_async_v3(stream_handle=self.stream.handle) - self.stream.synchronize() + self.context.execute_async_v3(stream_handle=self.stream.handle) + self.stream.synchronize() - 
cuda.memcpy_dtoh(self.h_output, self.d_output) - output_shape = [actual_batch, self.output_shape[1], self.output_shape[2]] - output = self.h_output[:actual_batch].reshape(output_shape) - return [output] + cuda.memcpy_dtoh(self.h_output, self.d_output) + output_shape = [actual_batch, self.output_shape[1], self.output_shape[2]] + output = self.h_output[:actual_batch].reshape(output_shape) + return [output] + finally: + try: + self.cuda_context.pop() + except Exception: + pass except Exception as e: raise RuntimeError(f"Failed to run TensorRT inference: {str(e)}")