# gps-denied-onboard/tests/unit/c7_inference/test_onnx_trt_ep_runtime.py

"""AZ-299 — :class:`OnnxTrtEpRuntime` acceptance tests.

The real ORT TRT EP path (provider negotiation, session creation,
warm-up, TRT subgraph compile) requires onnxruntime + a Tier-2 Jetson
host; those tests are guarded by :data:`_REQUIRE_ORT` and skip cleanly
on Tier-1 / macOS dev. CPU-runnable coverage uses fake ORT
:class:`InferenceSession` shims to verify:

- Protocol conformance + label (AC-1).
- ``.onnx`` deserialise skips the gate (AC-2).
- ``.engine`` deserialise invokes the gate first (AC-3).
- ``infer`` round-trips through ``session.run`` and returns named
  outputs (AC-4) + the input-binding-missing rewrap.
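- First-infer fallback WARN log + GCS callback + FDR record fire
  exactly once (AC-5).
- The handle's active provider is the first ``get_providers`` entry
  while the runtime label stays ``onnx_trt_ep`` (AC-6).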
- TRT EP options carry the workspace cap from
  ``config.gpu_memory_budget_bytes // 4`` (AC-8).
- ``release_engine`` is idempotent (AC-7).
- NFR-reliability — ORT internal exceptions rewrap to
  :class:`InferenceError`.
- CPU-fallback handling (Risk 2) emits the FDR + log; the hard-refusal
  toggle raises :class:`EngineDeserializeError`.
"""

from __future__ import annotations

import hashlib
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock

import numpy as np
import pytest

from gps_denied_onboard._types.inference import (
    BuildConfig,
    EngineCacheEntry,
    OptimizationProfile,
    PrecisionMode,
)
from gps_denied_onboard._types.thermal import ThermalState
from gps_denied_onboard.components.c7_inference import (
    C7InferenceConfig,
    DeploymentManifest,
    EngineDeserializeError,
    EngineSchemaMismatchError,
    HostTuple,
    InferenceError,
    InferenceRuntime,
    OutOfMemoryError,
)
from gps_denied_onboard.components.c7_inference.onnx_trt_ep_runtime import (
    CPU_EP,
    CUDA_EP,
    ENGINE_SUFFIX,
    ONNX_SUFFIX,
    OnnxTrtEpEngineHandle,
    OnnxTrtEpRuntime,
    TRT_EP,
    _sha256_of_file,
)
from gps_denied_onboard.config.schema import Config
from gps_denied_onboard.fdr_client.client import FdrClient
from gps_denied_onboard.fdr_client.records import (
    FdrRecord,
)
from gps_denied_onboard.helpers.sha256_sidecar import (
    SIDECAR_SUFFIX,
)

# Probe for onnxruntime once at import time; the result only feeds the
# skip marker below.
try:
    import onnxruntime  # type: ignore[import-not-found]  # noqa: F401
except ImportError:
    _HAS_ORT = False
else:
    _HAS_ORT = True

# Marker that skips a test whenever the onnxruntime import above failed.
_REQUIRE_ORT = pytest.mark.skipif(
    not _HAS_ORT,
    reason="onnxruntime not installed (Tier-2 Jetson / JetPack 6.2 only)",
)

# Skip reason passed to ``pytest.skip`` by the Tier-2 placeholder tests at
# the bottom of this module.
_TIER2_REASON = (
    "AZ-299 Tier-2 microbench harness owns the real-ORT perf / numerical "
    "asserts (C7-IT-05); skipped on Tier-1 CI / macOS dev."
)


# Host tuple returned by the ``host_tuple_provider`` lambdas injected in the
# tests below.
_TIER2_HOST = HostTuple(sm=87, jp="6.2", trt="10.3", precision=PrecisionMode.FP16)


# ----------------------------------------------------------------------
# Fakes.


class _FakeIoTensor:
    """Minimal stand-in for ``onnxruntime.NodeArg``.

    Exposes the three attributes the fakes in this module read: ``name``,
    ``shape`` (stored as a fresh list) and ``type``.
    """

    def __init__(self, name: str, shape: tuple[int, ...], dtype: str) -> None:
        # ``shape`` is copied into a list so the caller's tuple is not shared.
        self.name, self.shape, self.type = name, [*shape], dtype


class _FakeOrtSession:
    """Hand-rolled :class:`onnxruntime.InferenceSession` substitute.

    The fake records every ``run`` invocation in ``run_calls`` and counts
    ``end_profiling`` invocations in ``profiling_ended`` so tests can assert
    on how the runtime drove the session.

    Args:
        active_providers: Provider names reported by ``get_providers``.
        inputs: Input descriptors reported by ``get_inputs``.
        outputs: Output descriptors reported by ``get_outputs``; ``run``
            returns one zero-filled float32 array per descriptor.
        run_side_effect: When not ``None``, ``run`` raises this object
            instead of returning outputs.
    """

    def __init__(
        self,
        *,
        active_providers: tuple[str, ...] = (TRT_EP, CUDA_EP, CPU_EP),
        inputs: tuple[_FakeIoTensor, ...] = (
            _FakeIoTensor("x", (1, 3, 224, 224), "tensor(float)"),
        ),
        outputs: tuple[_FakeIoTensor, ...] = (
            _FakeIoTensor("y", (1, 1000), "tensor(float)"),
        ),
        run_side_effect: Any | None = None,
    ) -> None:
        self._active_providers = active_providers
        self._inputs = inputs
        self._outputs = outputs
        # Each element is the ``(output_names, feed)`` pair passed to ``run``.
        self.run_calls: list[tuple[list[str] | None, dict[str, np.ndarray]]] = []
        self.profiling_ended = 0
        self._run_side_effect = run_side_effect

    def get_providers(self) -> list[str]:
        """Return a fresh list of the configured provider names."""
        return list(self._active_providers)

    def get_inputs(self) -> list[_FakeIoTensor]:
        """Return a fresh list of the configured input descriptors."""
        return list(self._inputs)

    def get_outputs(self) -> list[_FakeIoTensor]:
        """Return a fresh list of the configured output descriptors."""
        return list(self._outputs)

    @staticmethod
    def _concrete_dim(dim: Any) -> Any:
        """Return ``dim`` when it is a positive integer, otherwise ``1``.

        Symbolic (``str``) or ``None`` dims cannot be compared with ``0``;
        they are treated like any other non-positive dim.
        """
        if isinstance(dim, (int, np.integer)) and dim > 0:
            return dim
        return 1

    def run(
        self,
        output_names: list[str] | None,
        feed: dict[str, np.ndarray],
    ) -> list[np.ndarray]:
        """Record the call, then raise the side effect or return zeros.

        Returns:
            One float32 zero array per configured output, shaped after the
            output descriptor with every non-positive or non-integer dim
            replaced by ``1``.

        Raises:
            The configured ``run_side_effect`` object, when one is set.
        """
        # The call is recorded before raising so failing runs stay visible.
        self.run_calls.append((output_names, feed))
        if self._run_side_effect is not None:
            raise self._run_side_effect
        return [
            np.zeros(
                tuple(self._concrete_dim(dim) for dim in out.shape),
                dtype=np.float32,
            )
            for out in self._outputs
        ]

    def end_profiling(self) -> None:
        """Count the invocation in ``profiling_ended``."""
        self.profiling_ended += 1


class _FakeOrt:
    """Replaces the lazy-imported ``onnxruntime`` module.

    ``InferenceSession`` remembers the path and keyword arguments of the
    most recent call, then either raises ``create_side_effect`` (when set)
    or hands back the session given at construction.
    """

    def __init__(self, session: _FakeOrtSession) -> None:
        self._session = session
        self.session_kwargs: dict[str, Any] = {}
        self.session_path: str | None = None
        self.create_side_effect: Any | None = None

    def InferenceSession(
        self,
        path: str,
        *,
        providers: list[str],
        provider_options: list[dict[str, Any]],
    ) -> _FakeOrtSession:
        """Record the call arguments and return the canned session.

        Raises:
            The ``create_side_effect`` object, when one has been assigned.
        """
        # Arguments are captured first so a failing creation is inspectable.
        self.session_path = path
        self.session_kwargs = dict(
            providers=providers,
            provider_options=provider_options,
        )
        failure = self.create_side_effect
        if failure is None:
            return self._session
        raise failure


# ----------------------------------------------------------------------
# Fixtures.


@pytest.fixture
def config(tmp_path: Path) -> Config:
    """Config whose C7 block selects ``onnx_trt_ep`` with caches under ``tmp_path``."""
    c7_block = C7InferenceConfig(
        runtime="onnx_trt_ep",
        engine_cache_dir=str(tmp_path / "engines"),
        ort_trt_cache_dir=str(tmp_path / "ort_cache"),
    )
    return Config.with_blocks(c7_inference=c7_block)


@pytest.fixture
def runtime_basic(config: Config) -> OnnxTrtEpRuntime:
    """Runtime built from the ``config`` fixture with no injected collaborators."""
    runtime = OnnxTrtEpRuntime(config)
    return runtime


def _make_engine_entry(
    tmp_path: Path,
    *,
    sm: int = 87,
    jp: str = "6.2",
    trt: str = "10.3",
    precision: PrecisionMode = PrecisionMode.FP16,
    payload: bytes = b"fake-engine-bytes",
) -> tuple[EngineCacheEntry, Path]:
    """Write a fake ``.engine`` file plus its SHA-256 sidecar under ``tmp_path``.

    The filename encodes ``sm`` / ``jp`` / ``trt`` / ``precision``; the
    sidecar (engine path + ``SIDECAR_SUFFIX``) holds the hex digest of
    ``payload``.

    Returns:
        The cache entry describing the engine and the engine file path.
    """
    engine_path = tmp_path / (
        f"ultravpr__sm{sm}_jp{jp}_trt{trt}_{precision.value}.engine"
    )
    engine_path.write_bytes(payload)

    digest = hashlib.sha256(payload).hexdigest()
    sidecar_path = Path(str(engine_path) + SIDECAR_SUFFIX)
    sidecar_path.write_text(digest, encoding="utf-8")

    return (
        EngineCacheEntry(
            engine_path=engine_path,
            sha256_hex=digest,
            sm=sm,
            jp=jp,
            trt=trt,
            precision=precision,
            extras={},
        ),
        engine_path,
    )


def _make_onnx_entry(tmp_path: Path) -> tuple[EngineCacheEntry, Path]:
    """Write a two-byte placeholder ``ultravpr.onnx`` and describe it.

    Returns:
        The cache entry (digest of the placeholder bytes, ``model_name``
        extra set to the file stem) and the ``.onnx`` path.
    """
    # Not a real ONNX model; the file only has to exist on disk.
    placeholder = b"\x08\x07"
    onnx_path = tmp_path / "ultravpr.onnx"
    onnx_path.write_bytes(placeholder)
    digest = hashlib.sha256(placeholder).hexdigest()
    return (
        EngineCacheEntry(
            engine_path=onnx_path,
            sha256_hex=digest,
            sm=87,
            jp="6.2",
            trt="10.3",
            precision=PrecisionMode.FP16,
            extras={"model_name": onnx_path.stem},
        ),
        onnx_path,
    )


def _manifest_for(engine_path: Path) -> DeploymentManifest:
    """Build a one-entry manifest mapping the file name to its SHA-256 hex digest.

    The manifest root is the directory containing ``engine_path``; the file
    is read from disk at call time to compute the digest.
    """
    digest = hashlib.sha256(engine_path.read_bytes()).hexdigest()
    manifest_entries = {engine_path.name: digest}
    return DeploymentManifest(root=engine_path.parent, entries=manifest_entries)


# ----------------------------------------------------------------------
# AC-1: Protocol + label.


def test_ac1_protocol_conformance(runtime_basic: OnnxTrtEpRuntime) -> None:
    """AC-1: the runtime is an ``InferenceRuntime`` and reports ``onnx_trt_ep``."""
    assert isinstance(runtime_basic, InferenceRuntime)
    reported_label = runtime_basic.current_runtime_label()
    assert reported_label == "onnx_trt_ep"


# ----------------------------------------------------------------------
# AC-2: deserialize from .onnx skips the gate.


def test_ac2_onnx_path_skips_gate(
    tmp_path: Path, config: Config
) -> None:
    """AC-2: deserialising an ``.onnx`` entry never calls ``gate.validate``."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)

    # A MagicMock does not raise when used; it only records calls, so the
    # "gate untouched" expectation is asserted explicitly after the act.
    gate = MagicMock()
    runtime = OnnxTrtEpRuntime(
        config,
        engine_gate=gate,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act
    handle = runtime.deserialize_engine(entry)

    # Assert — gate untouched, handle populated, session requested as expected.
    gate.validate.assert_not_called()
    assert isinstance(handle, OnnxTrtEpEngineHandle)
    assert handle._active_provider == TRT_EP
    assert handle._model_name == onnx_path.stem
    assert ort_module.session_path == str(onnx_path)
    requested_providers = ort_module.session_kwargs["providers"]
    assert requested_providers == [TRT_EP, CUDA_EP, CPU_EP]
    # Exactly one ``run`` call (the warm-up) was recorded during deserialise.
    warmup_calls = session.run_calls
    assert len(warmup_calls) == 1


# ----------------------------------------------------------------------
# AC-3: deserialize from .engine invokes the gate first.


def test_ac3_engine_path_invokes_gate_first(
    tmp_path: Path, config: Config
) -> None:
    """AC-3: a schema mismatch is raised without an ORT session being created."""
    # Arrange — the engine filename advertises sm=86 while the injected host
    # tuple reports sm=87, so deserialisation must be refused.
    entry, engine_path = _make_engine_entry(tmp_path, sm=86)

    class _NoSessionOrt:
        """ORT stand-in whose session factory fails the test if reached."""

        def InferenceSession(self, *args: Any, **kwargs: Any) -> Any:
            raise AssertionError(
                "AC-3: ORT session must NOT be created when gate refuses"
            )

    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(engine_path),
    )
    # Calling the class yields a fresh stand-in, same as a zero-arg factory.
    runtime._load_ort = _NoSessionOrt  # type: ignore[method-assign]

    # Act / Assert
    with pytest.raises(EngineSchemaMismatchError, match="sm=86"):
        runtime.deserialize_engine(entry)


def test_ac3_engine_path_passes_gate_then_creates_session(
    tmp_path: Path, config: Config
) -> None:
    """AC-3: a matching ``.engine`` entry yields a handle and a staged file."""
    # Arrange — engine tuple matches the injected host tuple (sm=87).
    entry, engine_path = _make_engine_entry(tmp_path, sm=87)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(engine_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act
    handle = runtime.deserialize_engine(entry)

    # Assert
    assert isinstance(handle, OnnxTrtEpEngineHandle)
    assert ort_module.session_path == str(engine_path)
    # A file with the engine's name must exist under the ORT cache dir.
    ort_cache_dir = Path(config.components["c7_inference"].ort_trt_cache_dir)
    staged_engine = ort_cache_dir / engine_path.name
    assert staged_engine.exists()


# ----------------------------------------------------------------------
# AC-4: infer round-trips and returns named outputs.


def test_ac4_infer_round_trip_named_outputs(
    tmp_path: Path, config: Config
) -> None:
    """AC-4: ``infer`` issues one ``session.run`` and returns outputs by name."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)
    # Drop whatever deserialisation recorded so only ``infer`` is counted.
    session.run_calls.clear()
    input_shape = (1, 3, 224, 224)

    # Act
    outputs = runtime.infer(
        handle, {"x": np.ones(input_shape, dtype=np.float32)}
    )

    # Assert — named output with the fake's shape.
    assert set(outputs.keys()) == {"y"}
    assert outputs["y"].shape == (1, 1000)
    # Assert — a single run call carrying the requested names and the feed.
    assert len(session.run_calls) == 1
    requested_outputs, feed = session.run_calls[0]
    assert requested_outputs == ["y"]
    assert "x" in feed
    assert feed["x"].shape == input_shape


def test_ac4_infer_missing_input_binding_rewraps(
    tmp_path: Path, config: Config
) -> None:
    """AC-4: an empty feed raises ``InferenceError`` naming the missing binding."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)
    empty_feed: dict[str, np.ndarray] = {}

    # Act / Assert
    with pytest.raises(InferenceError, match="missing input binding"):
        runtime.infer(handle, empty_feed)


# ----------------------------------------------------------------------
# AC-5: fallback WARN + gcs_alert + FDR fire exactly once.


def test_ac5_first_infer_fallback_alert_fires_once(
    tmp_path: Path, config: Config, caplog: pytest.LogCaptureFixture
) -> None:
    """AC-5: WARN log, GCS callback and FDR record fire on the first infer only."""
    fallback_kind = "c7.fallback_to_onnx_trt_ep"
    input_shape = (1, 3, 224, 224)

    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    gcs_messages: list[str] = []
    fdr = FdrClient(producer_id="c7_inference.onnx_trt_ep", capacity=16)
    runtime = OnnxTrtEpRuntime(
        config,
        is_fallback=True,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
        fdr_client=fdr,
        gcs_alert=lambda msg: gcs_messages.append(msg),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)

    # Act — first infer.
    caplog.set_level("WARNING", logger="c7_inference.onnx_trt_ep")
    runtime.infer(handle, {"x": np.ones(input_shape, dtype=np.float32)})

    # Assert — exactly one WARNING record mentioning the fallback.
    first_warnings = [
        record
        for record in caplog.records
        if record.levelname == "WARNING" and "fallback" in record.getMessage()
    ]
    assert len(first_warnings) == 1
    assert len(gcs_messages) == 1
    # Assert — the drained FDR batch holds a single fallback record.
    first_batch = fdr.drain(max_records=4)
    first_fallbacks = [
        record for record in first_batch if record.kind == fallback_kind
    ]
    assert len(first_fallbacks) == 1
    fallback_payload = first_fallbacks[0].payload
    assert fallback_payload["model_name"] == onnx_path.stem
    assert fallback_payload["active_provider"] == TRT_EP

    # Act — second infer must add no warning, alert or fallback record.
    caplog.clear()
    runtime.infer(handle, {"x": np.ones(input_shape, dtype=np.float32)})

    second_warnings = [
        record for record in caplog.records if record.levelname == "WARNING"
    ]
    assert second_warnings == []
    assert len(gcs_messages) == 1
    second_batch = fdr.drain(max_records=4)
    repeated_fallbacks = [
        record for record in second_batch if record.kind == fallback_kind
    ]
    assert repeated_fallbacks == []


def test_ac5_non_fallback_runtime_is_silent(
    tmp_path: Path,
    config: Config,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """AC-5: with ``is_fallback=False`` no fallback log or GCS alert is emitted."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    fdr = FdrClient(producer_id="c7_inference.onnx_trt_ep", capacity=16)
    gcs_messages: list[str] = []
    runtime = OnnxTrtEpRuntime(
        config,
        is_fallback=False,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
        fdr_client=fdr,
        gcs_alert=lambda msg: gcs_messages.append(msg),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)

    # Act
    caplog.set_level("WARNING", logger="c7_inference.onnx_trt_ep")
    feed = {"x": np.ones((1, 3, 224, 224), dtype=np.float32)}
    runtime.infer(handle, feed)

    # Assert — no captured record mentions the fallback, no alert was sent.
    fallback_mentions = [
        record for record in caplog.records if "fallback" in record.getMessage()
    ]
    assert fallback_mentions == []
    assert gcs_messages == []


# ----------------------------------------------------------------------
# AC-6: provider fallback chain (TRT refuses → CUDA active; label unchanged).


def test_ac6_active_provider_is_first_get_providers_entry(
    tmp_path: Path, config: Config
) -> None:
    """AC-6: the handle records the session's first provider; the label is unchanged."""
    # Arrange — the session reports CUDA first, i.e. TRT is not active.
    entry, onnx_path = _make_onnx_entry(tmp_path)
    reported_providers = (CUDA_EP, CPU_EP)
    session = _FakeOrtSession(active_providers=reported_providers)
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act
    handle = runtime.deserialize_engine(entry)

    # Assert
    assert handle._active_provider == CUDA_EP
    assert runtime.current_runtime_label() == "onnx_trt_ep"


# ----------------------------------------------------------------------
# AC-7: release_engine idempotent.


def test_ac7_release_engine_idempotent(
    tmp_path: Path, config: Config
) -> None:
    """AC-7: a second ``release_engine`` on the same handle changes nothing."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)

    # Act — first release flags the handle, drops the session and ends
    # profiling once.
    runtime.release_engine(handle)
    assert handle._released is True
    assert handle._session is None
    assert session.profiling_ended == 1

    # Act — second release must not end profiling again.
    runtime.release_engine(handle)
    assert session.profiling_ended == 1


def test_release_engine_ignores_foreign_handle(
    runtime_basic: OnnxTrtEpRuntime,
) -> None:
    """Releasing an object of an unrelated type must not raise."""

    class _Foreign:
        """Arbitrary type that is not an engine handle."""

    stranger = _Foreign()
    # The test passes as long as this call returns without raising.
    runtime_basic.release_engine(stranger)  # type: ignore[arg-type]


# ----------------------------------------------------------------------
# AC-8: TRT EP options carry the budget // 4 workspace cap.


def test_ac8_trt_ep_options_carry_budget_workspace_cap(
    tmp_path: Path, config: Config
) -> None:
    """AC-8: TRT EP options carry ``gpu_memory_budget_bytes // 4`` and the cache path."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act
    runtime.deserialize_engine(entry)

    # Assert — the first provider-option dict is taken to be the TRT EP one.
    provider_options = ort_module.session_kwargs["provider_options"]
    trt_options = provider_options[0]
    c7_block = config.components["c7_inference"]
    expected_workspace = int(c7_block.gpu_memory_budget_bytes) // 4
    assert trt_options["trt_max_workspace_size"] == expected_workspace
    assert trt_options["trt_engine_cache_enable"] is True
    cache_path = Path(trt_options["trt_engine_cache_path"])
    assert cache_path.exists()


# ----------------------------------------------------------------------
# Risk 2: CPU-fallback handling.


def test_cpu_fallback_emits_record_when_warn_allowed(
    tmp_path: Path, config: Config
) -> None:
    """Risk 2: a CPU-only session produces one ``c7.cpu_fallback`` FDR record."""
    # Arrange — the session reports the CPU provider as the only one.
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession(active_providers=(CPU_EP,))
    ort_module = _FakeOrt(session)
    fdr = FdrClient(producer_id="c7_inference.onnx_trt_ep", capacity=16)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
        fdr_client=fdr,
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act
    runtime.deserialize_engine(entry)

    # Assert — exactly one CPU-fallback record with the expected payload.
    drained = fdr.drain(max_records=4)
    cpu_fallbacks = [
        record for record in drained if record.kind == "c7.cpu_fallback"
    ]
    assert len(cpu_fallbacks) == 1
    (record,) = cpu_fallbacks
    assert record.payload["active_provider"] == CPU_EP
    assert record.payload["requested_providers"] == [TRT_EP, CUDA_EP, CPU_EP]


def test_cpu_fallback_raises_when_disallowed(
    tmp_path: Path,
) -> None:
    """Risk 2: with ``ort_disallow_cpu_fallback=True`` a CPU-only session is refused."""
    # Arrange — CPU-only session plus a config that forbids CPU fallback.
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession(active_providers=(CPU_EP,))
    ort_module = _FakeOrt(session)
    model_dir = onnx_path.parent
    strict_block = C7InferenceConfig(
        runtime="onnx_trt_ep",
        engine_cache_dir=str(model_dir / "engines"),
        ort_trt_cache_dir=str(model_dir / "ort_cache"),
        ort_disallow_cpu_fallback=True,
    )
    config = Config.with_blocks(c7_inference=strict_block)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act / Assert
    with pytest.raises(EngineDeserializeError, match="CPUExecutionProvider"):
        runtime.deserialize_engine(entry)


# ----------------------------------------------------------------------
# NFR-reliability: ORT internal exceptions rewrap into the AZ-297 family.


def test_nfr_reliability_session_creation_failure_rewraps(
    tmp_path: Path, config: Config
) -> None:
    """A ``RuntimeError`` from session creation surfaces as ``EngineDeserializeError``."""
    # Arrange — the fake ORT module raises when asked for a session.
    entry, onnx_path = _make_onnx_entry(tmp_path)
    ort_module = _FakeOrt(_FakeOrtSession())
    ort_module.create_side_effect = RuntimeError("ORT C++ exception")
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]

    # Act / Assert — the original message must survive the rewrap.
    with pytest.raises(EngineDeserializeError, match=r"ORT C\+\+ exception"):
        runtime.deserialize_engine(entry)


def test_nfr_reliability_infer_rewraps_runtime_error(
    tmp_path: Path, config: Config
) -> None:
    """A ``RuntimeError`` from ``session.run`` surfaces as ``InferenceError``."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)
    # Arm the failure only after deserialisation has completed.
    session._run_side_effect = RuntimeError("ORT runtime: OrtInvalidArgument")
    feed = {"x": np.ones((1, 3, 224, 224), dtype=np.float32)}

    # Act / Assert
    with pytest.raises(InferenceError, match="OrtInvalidArgument"):
        runtime.infer(handle, feed)


def test_nfr_reliability_infer_rewraps_memory_error(
    tmp_path: Path, config: Config
) -> None:
    """A ``MemoryError`` from ``session.run`` surfaces as ``OutOfMemoryError``."""
    # Arrange
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)
    # Arm the failure only after deserialisation has completed.
    session._run_side_effect = MemoryError("ORT mid-run OOM")
    feed = {"x": np.ones((1, 3, 224, 224), dtype=np.float32)}

    # Act / Assert
    with pytest.raises(OutOfMemoryError, match="OOM"):
        runtime.infer(handle, feed)


def test_infer_rejects_foreign_handle(runtime_basic: OnnxTrtEpRuntime) -> None:
    """``infer`` with a plain ``object()`` as handle raises ``InferenceError``."""
    not_a_handle = object()
    with pytest.raises(InferenceError, match="foreign handle"):
        runtime_basic.infer(not_a_handle, {})  # type: ignore[arg-type]


def test_infer_rejects_released_handle(
    tmp_path: Path, config: Config
) -> None:
    """``infer`` on a handle that was already released raises ``InferenceError``."""
    # Arrange — deserialise, then release straight away.
    entry, onnx_path = _make_onnx_entry(tmp_path)
    session = _FakeOrtSession()
    ort_module = _FakeOrt(session)
    runtime = OnnxTrtEpRuntime(
        config,
        host_tuple_provider=lambda _p: _TIER2_HOST,
        manifest_provider=lambda: _manifest_for(onnx_path),
    )
    runtime._load_ort = lambda: ort_module  # type: ignore[method-assign]
    handle = runtime.deserialize_engine(entry)
    runtime.release_engine(handle)
    feed = {"x": np.ones((1, 3, 224, 224), dtype=np.float32)}

    # Act / Assert
    with pytest.raises(InferenceError, match="released handle"):
        runtime.infer(handle, feed)


# ----------------------------------------------------------------------
# compile_engine + helpers.


def test_compile_engine_returns_onnx_entry(
    tmp_path: Path, runtime_basic: OnnxTrtEpRuntime
) -> None:
    """``compile_engine`` returns an entry pointing at the source ``.onnx`` file."""
    # Arrange — placeholder model file plus a single fixed-shape profile.
    onnx_path = tmp_path / "model.onnx"
    onnx_path.write_bytes(b"\x08\x07")
    fixed_shape = (1, 3, 224, 224)
    profile = OptimizationProfile(
        input_name="x",
        min_shape=fixed_shape,
        opt_shape=fixed_shape,
        max_shape=fixed_shape,
    )
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(profile,),
    )
    runtime_basic._host_tuple_provider = lambda _p: _TIER2_HOST

    # Act
    entry = runtime_basic.compile_engine(onnx_path, build_config)

    # Assert
    assert entry.engine_path == onnx_path
    assert entry.sha256_hex == _sha256_of_file(onnx_path)
    assert entry.precision is PrecisionMode.FP16
    assert entry.extras["model_name"] == "model"


def test_compile_engine_missing_onnx_raises(
    tmp_path: Path, runtime_basic: OnnxTrtEpRuntime
) -> None:
    """``compile_engine`` on a non-existent path raises ``EngineDeserializeError``."""
    # Arrange
    runtime_basic._host_tuple_provider = lambda _p: _TIER2_HOST
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    missing_model = tmp_path / "nope.onnx"  # never written to disk

    # Act / Assert
    with pytest.raises(EngineDeserializeError, match="not found"):
        runtime_basic.compile_engine(missing_model, build_config)


def test_deserialize_engine_unknown_suffix_raises(
    tmp_path: Path, config: Config
) -> None:
    """An entry whose file ends in ``.tflite`` is refused as unsupported."""
    # Arrange — a one-byte file with a suffix other than .onnx / .engine.
    tflite_path = tmp_path / "model.tflite"
    tflite_path.write_bytes(b"x")
    entry = EngineCacheEntry(
        engine_path=tflite_path,
        sha256_hex="x" * 64,
        sm=87,
        jp="6.2",
        trt="10.3",
        precision=PrecisionMode.FP16,
        extras={},
    )
    runtime = OnnxTrtEpRuntime(config)

    # Act / Assert
    with pytest.raises(EngineDeserializeError, match="unsupported"):
        runtime.deserialize_engine(entry)


# ----------------------------------------------------------------------
# thermal_state delegation.


def test_thermal_state_default_safe(runtime_basic: OnnxTrtEpRuntime) -> None:
    """Without a publisher, ``thermal_state`` reports no telemetry and no throttle."""
    state = runtime_basic.thermal_state()
    assert isinstance(state, ThermalState)
    assert state.is_telemetry_available is False
    assert state.thermal_throttle_active is False


def test_thermal_state_delegates_to_publisher(config: Config) -> None:
    """``thermal_state`` returns the exact object the injected publisher yields."""
    # Arrange — a publisher that always hands back one canned snapshot.
    canned = ThermalState(
        cpu_temp_c=44.0,
        gpu_temp_c=58.0,
        thermal_throttle_active=True,
        measured_clock_mhz=624,
        measured_at_ns=1_000_000_000,
        is_telemetry_available=True,
    )

    class _Publisher:
        """Publisher stub whose ``read`` returns the canned snapshot."""

        def read(self) -> ThermalState:
            return canned

    publisher = _Publisher()
    runtime = OnnxTrtEpRuntime(config, thermal_publisher=publisher)

    # Act
    observed = runtime.thermal_state()

    # Assert — identity, not just equality.
    assert observed is canned


# ----------------------------------------------------------------------
# Tier-2 placeholders — real ORT path lives in the AZ-299 microbench harness.


@_REQUIRE_ORT
@pytest.mark.tier2
def test_ac4_numerical_match_against_tensorrt_direct(
    tmp_path: Path, config: Config
) -> None:  # pragma: no cover - Tier-2 only
    """Tier-2 placeholder: always skips with ``_TIER2_REASON``."""
    pytest.skip(_TIER2_REASON)


@_REQUIRE_ORT
@pytest.mark.tier2
def test_nfr_perf_session_create_first_p95_under_30s(
    tmp_path: Path, config: Config
) -> None:  # pragma: no cover - Tier-2 only
    """Tier-2 placeholder: always skips with ``_TIER2_REASON``."""
    pytest.skip(_TIER2_REASON)


@_REQUIRE_ORT
@pytest.mark.tier2
def test_nfr_perf_session_create_subsequent_p95_under_5s(
    tmp_path: Path, config: Config
) -> None:  # pragma: no cover - Tier-2 only
    """Tier-2 placeholder: always skips with ``_TIER2_REASON``."""
    pytest.skip(_TIER2_REASON)