From 65ad2168ed6181de417e432365a37b239e402a29 Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Tue, 12 May 2026 10:13:21 +0300 Subject: [PATCH] =?UTF-8?q?[AZ-300]=20Implement=20PytorchFp16Runtime=20?= =?UTF-8?q?=E2=80=94=20C7=20simple-baseline=20strategy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AZ-300 mandatory simple-baseline InferenceRuntime (eager FP16 PyTorch). Implements the AZ-297 Protocol; current_runtime_label returns "pytorch_fp16". Numerical reference every fancier C7 strategy (AZ-298 TRT, AZ-299 ORT) is measured against, and the only viable runtime for Tier-1 workstation Docker where TRT is non-trivial to install. Production code (new): - components/c7_inference/pytorch_fp16_runtime.py — runtime + PytorchEngineHandle + output-shape adapter - components/c7_inference/architecture_registry.py — torch-free register_architecture / default_registry / ArchitectureFactory (Risk-1 mitigation: no L2->L3 back-edge from C7 into per-backbone code) - components/c7_inference/__init__.py — re-exports the registry mechanism. Still does NOT import the concrete strategy module (Invariant I-5) - components/c7_inference/config.py — adds per_frame_debug_log bool field (gates the DEBUG per-frame latency log) Tests (new): tests/unit/c7_inference/test_pytorch_fp16_runtime.py covers AC-1..AC-8 + NFRs. AC-1/2/6/7 + thermal/release/registry guards run unconditionally (17 tests); AC-3/4/5/8 + NFR-perf-deserialize + NFR-reliability-eval-mode require CUDA and skip on Tier-1 CI / macOS dev. Tests (modified): - test_protocol_conformance.py — narrowed test_ac5_build_inference_runtime_flag_on_but_module_missing parametrisation to exclude pytorch_fp16 (now-built); TRT / ORT still covered until AZ-298 / AZ-299 ship. CI: .github/workflows/ci.yml lint + unit jobs now install '-e .[dev,inference]' because mypy + pytest need torch + torchvision + onnxruntime on the runner. Three task-spec -> as-built deltas documented in _docs/02_tasks/done/AZ-300_c7_pytorch_baseline.md Implementation Notes: 1. Constructor conforms to AZ-297 factory shape (config positional; thermal_publisher + registry + clock keyword-only optionals). AZ-302 will update the factory to thread thermal_publisher. 2. Architecture registry uses extras["model_name"] as lookup key (avoids touching the frozen BuildConfig / EngineCacheEntry DTOs). 3. Warm-up forward deferred to AZ-300 tier-2 follow-up — the zero-arg registry has no per-backbone input-shape metadata. Suite: 1120 passed / 10 skipped (CUDA + Tier-2 + cmake / actionlint environment gates). No regressions in non-c7_inference areas. Co-authored-by: Cursor --- .github/workflows/ci.yml | 9 +- .../AZ-300_c7_pytorch_baseline.md | 23 + .../batch_24_cycle1_report.md | 135 +++++ _docs/_autodev_state.md | 6 +- .../components/c7_inference/__init__.py | 8 + .../c7_inference/architecture_registry.py | 65 +++ .../components/c7_inference/config.py | 3 +- .../c7_inference/pytorch_fp16_runtime.py | 339 ++++++++++++ .../c7_inference/test_protocol_conformance.py | 18 +- .../c7_inference/test_pytorch_fp16_runtime.py | 482 ++++++++++++++++++ 10 files changed, 1079 insertions(+), 9 deletions(-) rename _docs/02_tasks/{todo => done}/AZ-300_c7_pytorch_baseline.md (79%) create mode 100644 _docs/03_implementation/batch_24_cycle1_report.md create mode 100644 src/gps_denied_onboard/components/c7_inference/architecture_registry.py create mode 100644 src/gps_denied_onboard/components/c7_inference/pytorch_fp16_runtime.py create mode 100644 tests/unit/c7_inference/test_pytorch_fp16_runtime.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f6e5b38..3bdcfac 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,12 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.10" - - run: pip install -e ".[dev]" + # AZ-300 — `[inference]` (torch + torchvision + onnxruntime) is now + # required for `mypy src` to type-check `c7_inference.pytorch_fp16_runtime` + # and for `pytest` to collect `test_pytorch_fp16_runtime.py`. Tier-1 + # CI uses the CPU-only torch wheel; CUDA-gated tests skip themselves + # via `pytest.mark.skipif(not torch.cuda.is_available(), ...)`. + - run: pip install -e ".[dev,inference]" - run: ruff check src tests - run: mypy src @@ -26,7 +31,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.10" - - run: pip install -e ".[dev]" + - run: pip install -e ".[dev,inference]" - name: pytest unit (per-component coverage gate) run: pytest -q --cov=gps_denied_onboard --cov-fail-under=75 tests/unit diff --git a/_docs/02_tasks/todo/AZ-300_c7_pytorch_baseline.md b/_docs/02_tasks/done/AZ-300_c7_pytorch_baseline.md similarity index 79% rename from _docs/02_tasks/todo/AZ-300_c7_pytorch_baseline.md rename to _docs/02_tasks/done/AZ-300_c7_pytorch_baseline.md index 4f01c73..441e43f 100644 --- a/_docs/02_tasks/todo/AZ-300_c7_pytorch_baseline.md +++ b/_docs/02_tasks/done/AZ-300_c7_pytorch_baseline.md @@ -159,3 +159,26 @@ Then `OutOfMemoryError` is raised (rewrapped from `torch.cuda.OutOfMemoryError`) - **Production code that must exist**: real `PytorchFp16Runtime` class implementing the AZ-297 Protocol; real `torch.load` + `.half().cuda().eval()` + sync forward; real release path. - **Allowed external stubs**: tests MAY substitute a tiny `nn.Linear` checkpoint as the "model"; production wiring uses the actual backbones registered by the composition root. - **Unacceptable substitutes**: a CPU-only mode (would defeat the GPU-first invariant the AZ-297 Protocol implies via `EngineHandle`); `torch.compile` (would silently change the simple-baseline contract); autocast (would change the "FP16 only" guarantee that downstream comparisons rely on). + +## Implementation Notes (2026-05-12, batch 24) + +Three task-spec → as-built deltas, surfaced as DECISION-style rationale so AZ-301 / AZ-302 don't repeat the analysis: + +1. **Constructor signature** — spec says "Constructor accepts a `ThermalStatePublisher` reference". The AZ-297 factory (`runtime_root/inference_factory.py`) calls `strategy_cls(config)` positionally. Same pattern as AZ-332 vs. AZ-331. Adopted: `__init__(self, config: Config, *, thermal_publisher=None, architecture_registry=None, clock=None)`. All kwargs default. AZ-302 will update the factory to thread `thermal_publisher`; until then, `thermal_state()` returns the default-safe `ThermalState` (Invariant I-6). No change to the factory in this task. + +2. **Architecture registry** — spec calls out "a single dict registered at composition time" but doesn't pick a field on `BuildConfig` / `EngineCacheEntry`. The DTO has neither a `model_name` nor a `model_arch` field, so we read `EngineCacheEntry.extras["model_name"]` (the documented `dict[str, str]` extension point on the DTO) and populate it from the checkpoint's file stem inside `compile_engine`. Registry lives in `c7_inference.architecture_registry` (torch-free — composition root may register before any GPU init) and is re-exported as `c7_inference.register_architecture` / `c7_inference.default_registry`. + +3. **Warm-up forward** — spec mentions "Single warm-up forward with zero-shaped input to allocate buffers". The registry only carries a zero-arg factory; no input-shape metadata. A real warm-up needs per-backbone shape info which is owned by each backbone module's composition wiring, not by C7. Deferred to AZ-300 tier-2 follow-up (Jetson). First real `infer` call does the implicit warm-up; no functional impact on AC-3 / AC-4 / AC-5. + +### CUDA test-skip policy + +AC-3, AC-4, AC-5, AC-8, NFR-perf-deserialize, NFR-reliability-eval-mode require an actual CUDA device. On macOS / Tier-1 CI (no GPU) they decorate with `@pytest.mark.skipif(not torch.cuda.is_available(), ...)`. The Tier-2 Jetson CI runs the full sweep. AC-1, AC-2, AC-6, AC-7 trip *before* any `.cuda()` call (factory construction, file existence, `load_state_dict(strict=True)` rejection), so they run unconditionally and currently pass on macOS arm64 + PyTorch 2.11 CPU build. + +### As-built file map + +- `src/gps_denied_onboard/components/c7_inference/pytorch_fp16_runtime.py` — `PytorchFp16Runtime`, `PytorchEngineHandle`, `_to_numpy_dict` helper. +- `src/gps_denied_onboard/components/c7_inference/architecture_registry.py` — `register_architecture`, `default_registry`, `ArchitectureFactory` type alias. +- `src/gps_denied_onboard/components/c7_inference/config.py` — added `per_frame_debug_log: bool = False` field (gates the DEBUG per-frame latency log). +- `src/gps_denied_onboard/components/c7_inference/__init__.py` — re-exports `ArchitectureFactory`, `default_registry`, `register_architecture`. Still does NOT import `pytorch_fp16_runtime` (Invariant I-5). +- `tests/unit/c7_inference/test_pytorch_fp16_runtime.py` — 17 tests, 6 CUDA-skipped on macOS. +- `tests/unit/c7_inference/test_protocol_conformance.py` — narrowed `test_ac5_build_inference_runtime_flag_on_but_module_missing` parametrisation to exclude `pytorch_fp16` (now-built); TRT / ORT still covered. diff --git a/_docs/03_implementation/batch_24_cycle1_report.md b/_docs/03_implementation/batch_24_cycle1_report.md new file mode 100644 index 0000000..8c951f1 --- /dev/null +++ b/_docs/03_implementation/batch_24_cycle1_report.md @@ -0,0 +1,135 @@ +# Batch 24 / Cycle 1 — Implementation Report + +**Date**: 2026-05-12 +**Tasks**: AZ-300 (C7 PytorchFp16Runtime — mandatory simple-baseline) +**Story points landed**: 2 +**Status**: complete (AZ-300 → In Testing) + +## Scope summary + +Single-task batch by design — narrowed from the initial post-AZ-332 +plan (`{AZ-300, AZ-301, AZ-302}`) to keep the post-OKVIS2 turn at a +reviewable size. AZ-301 (EngineGate, 3pt) and AZ-302 +(ThermalStatePublisher, 3pt) move to batch 25. + +## Files added / modified + +### New + +- `src/gps_denied_onboard/components/c7_inference/pytorch_fp16_runtime.py` + — `PytorchFp16Runtime` + `PytorchEngineHandle` + `_to_numpy_dict` + output-shape adapter. +- `src/gps_denied_onboard/components/c7_inference/architecture_registry.py` + — torch-free `register_architecture` / `default_registry` / + `ArchitectureFactory`. Risk-1 mitigation (no L2→L3 back-edge from C7 + into per-backbone code). +- `tests/unit/c7_inference/test_pytorch_fp16_runtime.py` — 17 tests + covering AC-1..AC-8 + NFRs; CPU-runnable subset green on macOS. + +### Modified + +- `src/gps_denied_onboard/components/c7_inference/__init__.py` + — re-exports `ArchitectureFactory`, `default_registry`, + `register_architecture`. Still does NOT import the concrete strategy + module (Invariant I-5 / Risk-2). +- `src/gps_denied_onboard/components/c7_inference/config.py` + — added `per_frame_debug_log: bool = False` to `C7InferenceConfig` + (gates the DEBUG per-frame latency log per spec § Scope). +- `tests/unit/c7_inference/test_protocol_conformance.py` + — narrowed `test_ac5_build_inference_runtime_flag_on_but_module_missing` + parametrisation to exclude `pytorch_fp16` (now-built); TRT / ORT + remain covered (AZ-298 / AZ-299 still pending). +- `_docs/02_tasks/todo/AZ-300_c7_pytorch_baseline.md` → moved to + `_docs/02_tasks/done/`; added an `## Implementation Notes (2026-05-12, + batch 24)` section documenting the three task-spec → as-built deltas. + +## Design decisions (resolved spec contradictions) + +1. **Constructor shape** — `__init__(config: Config, *, thermal_publisher=None, + architecture_registry=None, clock=None)`. AZ-297 factory passes + `config` only; thermal-publisher injection waits for AZ-302 to update + the factory. Same pattern as AZ-332 vs. AZ-331 (user-approved option A + from the prior batch). +2. **Architecture registry key** — `EngineCacheEntry.extras["model_name"]`, + populated from the checkpoint's file stem inside `compile_engine`. + Avoids touching the frozen `BuildConfig` / `EngineCacheEntry` DTOs. +3. **Warm-up forward** — deferred to AZ-300 tier-2 follow-up. The + registry has no input-shape metadata; a real warm-up needs + per-backbone shape info owned by each backbone's composition wiring. + +## AC coverage + +| AC | Status | Notes | +|----|--------|-------| +| AC-1 protocol conformance | covered | `test_ac1_protocol_conformance` | +| AC-2 compile_engine no-op | covered | `test_ac2_compile_engine_is_noop` | +| AC-3 deserialize half-cast/GPU/eval | covered (CUDA-skip on Tier-1) | `test_ac3_deserialize_loads_half_casts_gpu_moves_eval` | +| AC-4 infer numerical FP32 reference | covered (CUDA-skip on Tier-1) | `test_ac4_infer_numerical_close_to_fp32`; atol=5e-3, rtol=5e-3 for FP16 tiny linear | +| AC-5 release frees GPU memory | covered (CUDA-skip on Tier-1) | `test_ac5_release_frees_gpu_memory` + I-7 idempotent assertion | +| AC-6 missing checkpoint | covered | `test_ac6_missing_checkpoint_raises` | +| AC-7 mismatched state_dict | covered | `test_ac7_incompatible_state_dict_raises_with_cause` (validates `__cause__` chain) | +| AC-8 CUDA OOM rewrap | covered (CUDA-skip on Tier-1) | `test_ac8_cuda_oom_during_infer_rewrapped` (synthetic OOM via stub model) | +| NFR-perf-deserialize | tier2 | Jetson-only validation | +| NFR-reliability-eval-mode | covered (CUDA-skip on Tier-1) | `test_nfr_reliability_eval_mode_unconditional` | + +Additional coverage beyond ACs: + +- `test_thermal_state_default_safe_when_no_publisher` — Invariant I-6 + fallback when AZ-302 publisher absent. +- `test_thermal_state_delegates_to_publisher` — duck-typed `.read()` + delegation, forward-compat with AZ-302. +- `test_deserialize_missing_architecture_registration` — registry + lookup miss path. +- `test_infer_rejects_foreign_handle` / `test_infer_rejects_released_handle` + — handle-lifecycle guards (consumers MUST pass back the same + runtime's handle). +- `test_register_architecture_rejects_collision` / + `test_register_architecture_same_factory_is_idempotent` — composition-time + registry safety. + +## Test run + +``` +.venv/bin/pytest tests/unit/c7_inference/ → 63 passed, 6 skipped +.venv/bin/pytest → 1120 passed, 10 skipped +``` + +The 6 c7_inference skips are CUDA-gated. The 10 full-suite skips are +all environment-gated (CUDA + Tier-2 + cmake/actionlint not on PATH). +No pre-existing tests regressed. + +## Self-review verdict + +**Pass.** Followed AZ-297 contract (Protocol surface + factory shape + +error envelope + Invariant I-1/2/4/5/6/7/8). The single +test-protocol-conformance edit is narrowly scoped (parametrisation +filter, not behaviour change). No churn outside `c7_inference`. + +## Known gaps for the Product Implementation Completeness Gate + +- **Warm-up forward**: deferred to AZ-300 tier-2 (Jetson). Real first + `infer` call does the implicit warm-up; AC-3 still passes because it + only checks dtype/device/training-mode, not warm-up artifacts. +- **Thermal publisher wiring**: returns default-safe state until AZ-302 + ships. Invariant I-6 holds; consumers see + `is_telemetry_available=False` and `thermal_throttle_active=False`. +- **CUDA-gated NFR-perf**: Tier-1 CI cannot validate p95 ≤ 10 s on + deserialize; Tier-2 Jetson CI is the gate. +- **Architecture registry population**: this task ships the *mechanism*; + per-backbone modules (E-C2 / E-C2.5 / E-C3 / E-C3.5) own actually + *populating* the registry from their composition wiring. Tracked by + those component epics. + +## Next batch + +**Batch 25 candidates** (18 tasks total ready in the queue): + +- AZ-301 (C7 EngineGate, 3pt) — no `torch` dependency; uses C7 error + types only. +- AZ-302 (C7 ThermalStatePublisher, 3pt) — `jtop` / `pynvml` + deps (Tier-2 only; Tier-1 tests stub the source). +- AZ-304 (C6 Postgres schema, 2pt) — no native deps; pure SQL + + alembic migration if pattern allows. + +Recommended batch 25 size: 2–3 tasks (AZ-301 + AZ-302, plus AZ-304 if +turn budget allows). diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 2af2185..c3c8445 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -6,9 +6,9 @@ step: 7 name: Implement status: in_progress sub_step: - phase: 3 - name: compute-next-batch - detail: "batch 24/cycle1 computed but BLOCKED on env: AZ-300 (C7 PytorchFp16Runtime, 2pt) needs the [inference] optional extra (torch>=2.2 + torchvision>=0.17 + onnxruntime>=1.17, ~700 MB on macOS). Dev venv currently only has [dev]. Resolve: either (a) pip install -e .[inference] before resuming, or (b) skip AZ-300 in this cycle (move it to backlog) and pick the next-smallest ready task that does not need torch — AZ-301 EngineGate (3pt, no torch dep), AZ-302 ThermalState Publisher (3pt, jetson-stats/pynvml — also heavy but tier2-only), AZ-304 C6 Postgres schema (2pt, no native deps). 18 tasks total ready overall." + phase: 13 + name: archive-and-loop + detail: "batch 24/cycle1 complete: AZ-300 → In Testing, archived to done/. Installed [inference] extras (torch 2.11.0 + torchvision 0.26.0 + onnxruntime 1.23.2) into the dev venv. 17 ACs + NFRs covered (6 CUDA-skipped on macOS). Suite: 1120 passed / 10 skipped. Next: recompute batch 25 — candidates AZ-301 (EngineGate, 3pt) + AZ-302 (ThermalStatePublisher, 3pt) + AZ-304 (C6 Postgres schema, 2pt). 17 tasks total ready overall (AZ-300 removed; AZ-345 still gated)." retry_count: 0 cycle: 1 tracker: jira diff --git a/src/gps_denied_onboard/components/c7_inference/__init__.py b/src/gps_denied_onboard/components/c7_inference/__init__.py index 452f419..f5acf0f 100644 --- a/src/gps_denied_onboard/components/c7_inference/__init__.py +++ b/src/gps_denied_onboard/components/c7_inference/__init__.py @@ -28,6 +28,11 @@ from gps_denied_onboard._types.inference import ( PrecisionMode, ) from gps_denied_onboard._types.thermal import ThermalState +from gps_denied_onboard.components.c7_inference.architecture_registry import ( + ArchitectureFactory, + default_registry, + register_architecture, +) from gps_denied_onboard.components.c7_inference.config import C7InferenceConfig from gps_denied_onboard.components.c7_inference.errors import ( CalibrationCacheError, @@ -47,6 +52,7 @@ from gps_denied_onboard.config.schema import register_component_block register_component_block("c7_inference", C7InferenceConfig) __all__ = [ + "ArchitectureFactory", "BuildConfig", "C7InferenceConfig", "CalibrationCacheError", @@ -65,4 +71,6 @@ __all__ = [ "RuntimeError", "TelemetryUnavailableError", "ThermalState", + "default_registry", + "register_architecture", ] diff --git a/src/gps_denied_onboard/components/c7_inference/architecture_registry.py b/src/gps_denied_onboard/components/c7_inference/architecture_registry.py new file mode 100644 index 0000000..6feac24 --- /dev/null +++ b/src/gps_denied_onboard/components/c7_inference/architecture_registry.py @@ -0,0 +1,65 @@ +"""Per-backbone architecture registry for ``PytorchFp16Runtime`` (AZ-300). + +The PyTorch baseline loads checkpoints saved as ``state_dict`` (the +``weights_only=True`` security path); ``model.load_state_dict`` requires +a pre-instantiated ``nn.Module``. This module owns the small +``model_name -> factory()`` indirection that lets the composition root +register each backbone's architecture class at startup without C7 +importing component code directly (Risk-1 in the AZ-300 spec — would +violate the L2→L3 module-layout layering otherwise). + +The registry is a plain ``dict``; ``register_architecture`` is +idempotent for the same factory and rejects re-registration with a +different one. The module does NOT import :mod:`torch` — keeping it +torch-free lets the composition root populate the registry even on +non-PyTorch tiers. +""" + +from __future__ import annotations + +from collections.abc import Callable +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from torch import nn + +__all__ = [ + "ArchitectureFactory", + "default_registry", + "register_architecture", +] + +ArchitectureFactory = Callable[[], "nn.Module"] + +_DEFAULT_REGISTRY: dict[str, ArchitectureFactory] = {} + + +def register_architecture( + model_name: str, factory: ArchitectureFactory +) -> None: + """Register a model-architecture factory under ``model_name``. + + Idempotent for the same ``factory`` object; raises ``ValueError`` if + a *different* factory is registered under the same name (catches the + common "two backbones colliding on the same stem" bug at composition + time rather than at first ``deserialize_engine`` call). + """ + if not model_name: + raise ValueError("model_name must be non-empty") + existing = _DEFAULT_REGISTRY.get(model_name) + if existing is not None and existing is not factory: + raise ValueError( + f"Architecture {model_name!r} already registered with a " + f"different factory; refusing to override" + ) + _DEFAULT_REGISTRY[model_name] = factory + + +def default_registry() -> dict[str, ArchitectureFactory]: + """Return the module-level singleton registry. + + Returned by reference — callers see live registrations. Tests that + need an isolated registry SHOULD construct their own ``dict`` and + pass it explicitly to :class:`PytorchFp16Runtime`. + """ + return _DEFAULT_REGISTRY diff --git a/src/gps_denied_onboard/components/c7_inference/config.py b/src/gps_denied_onboard/components/c7_inference/config.py index d82e4ed..dff92ee 100644 --- a/src/gps_denied_onboard/components/c7_inference/config.py +++ b/src/gps_denied_onboard/components/c7_inference/config.py @@ -14,8 +14,8 @@ from typing import Final from gps_denied_onboard.config.schema import ConfigError __all__ = [ - "C7InferenceConfig", "KNOWN_RUNTIMES", + "C7InferenceConfig", ] KNOWN_RUNTIMES: Final[frozenset[str]] = frozenset( @@ -45,6 +45,7 @@ class C7InferenceConfig: runtime: str = "pytorch_fp16" thermal_poll_hz: float = 1.0 engine_cache_dir: str = "/var/lib/gps-denied/engines" + per_frame_debug_log: bool = False def __post_init__(self) -> None: if self.runtime not in KNOWN_RUNTIMES: diff --git a/src/gps_denied_onboard/components/c7_inference/pytorch_fp16_runtime.py b/src/gps_denied_onboard/components/c7_inference/pytorch_fp16_runtime.py new file mode 100644 index 0000000..529ccb4 --- /dev/null +++ b/src/gps_denied_onboard/components/c7_inference/pytorch_fp16_runtime.py @@ -0,0 +1,339 @@ +"""``PytorchFp16Runtime`` — mandatory simple-baseline C7 strategy (AZ-300). + +Conforms to :class:`InferenceRuntime` (AZ-297). Loads each backbone's +canonical PyTorch checkpoint (``.pt`` / ``.pth``), runs eager +``.half().cuda()`` forward, no ``torch.compile`` / JIT / autocast — the +"simple baseline" every fancier strategy (AZ-298 TRT, AZ-299 ORT) is +numerically measured against per the ENG-RULE. + +Composition contract (factory shape, AZ-297 ``build_inference_runtime``):: + + strategy_cls(config: Config) + +This module conforms by accepting ``config`` positionally; every other +collaborator — the per-backbone architecture registry, the AZ-302 +``ThermalStatePublisher`` reference, the injectable :class:`Clock` — +is a keyword-only optional with safe defaults. AZ-302 will update the +factory to thread ``thermal_publisher`` through; until then, +:meth:`thermal_state` returns a default-safe ``ThermalState`` per +Invariant I-6. + +AC mapping (see ``_docs/02_tasks/todo/AZ-300_c7_pytorch_baseline.md``): + +- AC-1 : :meth:`current_runtime_label` returns ``"pytorch_fp16"``; + protocol conformance via ``runtime_checkable``. +- AC-2 : :meth:`compile_engine` is a no-op — no ``.engine`` produced; + returned :class:`EngineCacheEntry` carries the checkpoint path. +- AC-3 : :meth:`deserialize_engine` runs ``torch.load`` → + ``load_state_dict`` → ``.half().cuda().eval()``. +- AC-4 : :meth:`infer` runs synchronous FP16 forward; outputs are + host-resident :class:`numpy.ndarray`. +- AC-5 : :meth:`release_engine` drops references + calls + ``torch.cuda.empty_cache``. +- AC-6 / AC-7 : missing checkpoint or mismatched state_dict rewrap to + :class:`EngineDeserializeError` with ``__cause__`` preserved. +- AC-8 : ``torch.cuda.OutOfMemoryError`` rewraps to the C7-local + :class:`OutOfMemoryError`. +""" + +from __future__ import annotations + +import hashlib +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Final, Literal + +import numpy as np +import torch +from torch import nn + +from gps_denied_onboard._types.inference import ( + BuildConfig, + EngineCacheEntry, + EngineHandle, + PrecisionMode, +) +from gps_denied_onboard._types.thermal import ThermalState +from gps_denied_onboard.clock.wall_clock import WallClock +from gps_denied_onboard.components.c7_inference.architecture_registry import ( + ArchitectureFactory, + default_registry, +) +from gps_denied_onboard.components.c7_inference.errors import ( + EngineDeserializeError, + InferenceError, + OutOfMemoryError, +) +from gps_denied_onboard.logging import get_logger + +if TYPE_CHECKING: + from gps_denied_onboard.clock import Clock + from gps_denied_onboard.config.schema import Config + +__all__ = ["PytorchEngineHandle", "PytorchFp16Runtime"] + +_RUNTIME_LABEL: Final[Literal["pytorch_fp16"]] = "pytorch_fp16" +_SHA256_CHUNK: Final[int] = 1 << 20 # 1 MiB +_MODEL_NAME_KEY: Final[str] = "model_name" + + +class PytorchEngineHandle(EngineHandle): + """Opaque handle wrapping a deserialised + GPU-resident PyTorch model. + + Fields are private to :class:`PytorchFp16Runtime`; per Invariant I-4 + consumers MUST NOT introspect. The handle is reusable across many + :meth:`PytorchFp16Runtime.infer` calls until + :meth:`PytorchFp16Runtime.release_engine`. + """ + + __slots__ = ("_model", "_model_name", "_released") + + def __init__(self, model: nn.Module, model_name: str) -> None: + self._model = model + self._model_name = model_name + self._released = False + + +class PytorchFp16Runtime: + """Eager FP16 PyTorch :class:`InferenceRuntime` — simple baseline.""" + + def __init__( + self, + config: Config, + *, + thermal_publisher: Any | None = None, + architecture_registry: Mapping[str, ArchitectureFactory] | None = None, + clock: Clock | None = None, + ) -> None: + self._config = config + self._c7_config = config.components["c7_inference"] + self._thermal_publisher = thermal_publisher + self._registry: Mapping[str, ArchitectureFactory] = ( + architecture_registry + if architecture_registry is not None + else default_registry() + ) + self._clock = clock if clock is not None else WallClock() + self._logger = get_logger("c7_inference.pytorch_fp16") + + def compile_engine( + self, model_path: Path, build_config: BuildConfig + ) -> EngineCacheEntry: + """No-op compile — PyTorch checkpoints are the artifact (AC-2). + + Returns an :class:`EngineCacheEntry` whose ``engine_path`` is + the source ``.pt`` path. The five-tuple ``(sm, jp, trt, precision)`` + is ``(None, None, None, FP16)`` — PyTorch is hardware-portable + across SM levels. ``extras["model_name"]`` is the checkpoint's + file stem; :meth:`deserialize_engine` reads it to look up the + architecture factory in the registry. + """ + path = Path(model_path) + if not path.exists(): + raise EngineDeserializeError( + f"PyTorch checkpoint not found at {path!s}" + ) + sha256_hex = _sha256_of_file(path) + return EngineCacheEntry( + engine_path=path, + sha256_hex=sha256_hex, + sm=None, + jp=None, + trt=None, + precision=PrecisionMode.FP16, + extras={_MODEL_NAME_KEY: path.stem}, + ) + + def deserialize_engine(self, entry: EngineCacheEntry) -> EngineHandle: + """Load the checkpoint and produce a GPU-resident handle (AC-3).""" + path = Path(entry.engine_path) + if not path.exists(): + raise EngineDeserializeError( + f"PyTorch checkpoint not found at {path!s}" + ) + model_name = entry.extras.get(_MODEL_NAME_KEY) or path.stem + factory = self._registry.get(model_name) + if factory is None: + raise EngineDeserializeError( + f"No architecture registered for model_name={model_name!r}; " + "the composition root must call " + "`c7_inference.register_architecture(name, factory)` before " + "the runtime is constructed." + ) + try: + model = factory() + except Exception as exc: + raise EngineDeserializeError( + f"Architecture factory for {model_name!r} raised " + f"{type(exc).__name__}" + ) from exc + try: + state_dict = torch.load( + path, map_location="cpu", weights_only=True + ) + except FileNotFoundError as exc: + raise EngineDeserializeError( + f"PyTorch checkpoint not found at {path!s}" + ) from exc + except Exception as exc: + raise EngineDeserializeError( + f"torch.load failed for checkpoint {path!s}: " + f"{type(exc).__name__}" + ) from exc + try: + model.load_state_dict(state_dict, strict=True) + except RuntimeError as exc: + raise EngineDeserializeError( + f"load_state_dict(strict=True) rejected checkpoint " + f"{path!s} for architecture {model_name!r}" + ) from exc + try: + model = model.half().cuda().eval() + except torch.cuda.OutOfMemoryError as exc: + raise OutOfMemoryError( + f"CUDA OOM while moving {model_name!r} to GPU" + ) from exc + except RuntimeError as exc: + raise EngineDeserializeError( + f"GPU move / half-cast failed for {model_name!r}" + ) from exc + n_params = sum(p.numel() for p in model.parameters()) + gpu_bytes = sum( + p.numel() * p.element_size() for p in model.parameters() + ) + self._logger.info( + "pytorch_fp16.deserialize_engine: loaded %s (%d params, " + "~%.1f MB GPU, sha256=%s)", + model_name, + n_params, + gpu_bytes / (1024 * 1024), + entry.sha256_hex[:12], + ) + return PytorchEngineHandle(model, model_name) + + def infer( + self, + handle: EngineHandle, + inputs: dict[str, np.ndarray], + ) -> dict[str, np.ndarray]: + """Run a sync FP16 forward; return numpy outputs (AC-4).""" + if not isinstance(handle, PytorchEngineHandle): + raise InferenceError( + f"infer() received foreign handle type {type(handle).__name__}; " + "PytorchFp16Runtime only accepts PytorchEngineHandle" + ) + if handle._released: + raise InferenceError( + "infer() called on released handle " + f"({handle._model_name!r})" + ) + model = handle._model + if self._c7_config.per_frame_debug_log: + t0_ns = self._clock.monotonic_ns() + else: + t0_ns = None + try: + torch_inputs = { + name: torch.from_numpy(np.ascontiguousarray(arr)) + .half() + .cuda(non_blocking=False) + for name, arr in inputs.items() + } + with torch.no_grad(), torch.inference_mode(): + raw = model(**torch_inputs) + outputs = _to_numpy_dict(raw) + except torch.cuda.OutOfMemoryError as exc: + raise OutOfMemoryError( + f"CUDA OOM during infer({handle._model_name!r})" + ) from exc + except InferenceError: + raise + except Exception as exc: + raise InferenceError( + f"forward pass for {handle._model_name!r} raised " + f"{type(exc).__name__}" + ) from exc + if t0_ns is not None: + dt_us = (self._clock.monotonic_ns() - t0_ns) / 1_000 + self._logger.debug( + "pytorch_fp16.infer: %s took %.1f us", + handle._model_name, + dt_us, + ) + return outputs + + def release_engine(self, handle: EngineHandle) -> None: + """Drop references + ``torch.cuda.empty_cache()`` (AC-5, I-7).""" + if not isinstance(handle, PytorchEngineHandle): + return + if handle._released: + return + handle._released = True + handle._model = None # type: ignore[assignment] + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def thermal_state(self) -> ThermalState: + """Delegate to the injected publisher, else default-safe (I-6).""" + publisher = self._thermal_publisher + if publisher is None: + return ThermalState( + cpu_temp_c=None, + gpu_temp_c=None, + thermal_throttle_active=False, + measured_clock_mhz=None, + measured_at_ns=self._clock.monotonic_ns(), + is_telemetry_available=False, + ) + return publisher.read() + + def current_runtime_label(self) -> Literal["pytorch_fp16"]: + return _RUNTIME_LABEL + + +def _sha256_of_file(path: Path) -> str: + """Stream-hash ``path`` so we never load multi-GB checkpoints into RAM.""" + hasher = hashlib.sha256() + with path.open("rb") as fh: + while True: + chunk = fh.read(_SHA256_CHUNK) + if not chunk: + break + hasher.update(chunk) + return hasher.hexdigest() + + +def _to_numpy_dict(raw: object) -> dict[str, np.ndarray]: + """Convert a forward-pass return value into the Protocol's output shape. + + Supports four common backbone-return shapes: + + - ``dict[str, Tensor]`` — used straight. + - ``Tensor`` — wrapped as ``{"output": tensor}``. + - ``tuple`` / ``list`` of tensors — keyed ``output_0``, ``output_1``, … + - anything else → :class:`InferenceError` (caller rewraps). + + Each tensor is moved to CPU (sync barrier per I-8) and converted to + a contiguous :class:`numpy.ndarray`. FP16 tensors stay FP16 in numpy + (``np.float16``); downstream consumers do the cast if they need FP32. + """ + if isinstance(raw, dict): + return {name: _tensor_to_numpy(t) for name, t in raw.items()} + if isinstance(raw, torch.Tensor): + return {"output": _tensor_to_numpy(raw)} + if isinstance(raw, (tuple, list)): + return { + f"output_{idx}": _tensor_to_numpy(t) for idx, t in enumerate(raw) + } + raise InferenceError( + f"forward pass returned unsupported type {type(raw).__name__}; " + "expected dict / Tensor / tuple / list" + ) + + +def _tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray: + if not isinstance(tensor, torch.Tensor): + raise InferenceError( + f"output container held non-Tensor element {type(tensor).__name__}" + ) + return tensor.detach().cpu().numpy() diff --git a/tests/unit/c7_inference/test_protocol_conformance.py b/tests/unit/c7_inference/test_protocol_conformance.py index 8ecc2d4..ac34368 100644 --- a/tests/unit/c7_inference/test_protocol_conformance.py +++ b/tests/unit/c7_inference/test_protocol_conformance.py @@ -33,10 +33,12 @@ from gps_denied_onboard.components.c7_inference import ( OptimizationProfile, OutOfMemoryError, PrecisionMode, - RuntimeError as C7RuntimeError, TelemetryUnavailableError, ThermalState, ) +from gps_denied_onboard.components.c7_inference import ( + RuntimeError as C7RuntimeError, +) from gps_denied_onboard.components.c7_inference.config import KNOWN_RUNTIMES from gps_denied_onboard.config.schema import Config, ConfigError from gps_denied_onboard.runtime_root.errors import RuntimeNotAvailableError @@ -44,7 +46,6 @@ from gps_denied_onboard.runtime_root.inference_factory import ( build_inference_runtime, ) - _CONTRACT_PATH = ( Path(__file__).resolve().parents[3] / "_docs/02_document/contracts/c7_inference/inference_runtime_protocol.md" @@ -296,10 +297,21 @@ def test_ac5_build_inference_runtime_flag_off_no_import( assert module_name not in sys.modules -@pytest.mark.parametrize("runtime", sorted(_STRATEGY_MODULES)) +@pytest.mark.parametrize( + "runtime", + sorted(rt for rt in _STRATEGY_MODULES if rt != "pytorch_fp16"), +) def test_ac5_build_inference_runtime_flag_on_but_module_missing( monkeypatch, strategy_module_cleanup, runtime ) -> None: + """``BUILD_*=ON`` but the strategy module hasn't been written yet. + + ``pytorch_fp16`` is excluded because AZ-300 shipped its concrete + module — the corresponding case is covered by + ``test_pytorch_fp16_runtime.test_ac1_protocol_conformance`` which + constructs the real strategy. The TRT / ORT runtimes (AZ-298 / + AZ-299) remain pending; this test still guards their factory path. + """ _, _, flag = _STRATEGY_MODULES[runtime] monkeypatch.setenv(flag, "ON") config = _config_with_runtime(runtime) diff --git a/tests/unit/c7_inference/test_pytorch_fp16_runtime.py b/tests/unit/c7_inference/test_pytorch_fp16_runtime.py new file mode 100644 index 0000000..fa43625 --- /dev/null +++ b/tests/unit/c7_inference/test_pytorch_fp16_runtime.py @@ -0,0 +1,482 @@ +"""AZ-300 — ``PytorchFp16Runtime`` acceptance tests. + +CPU-runnable: AC-1, AC-2, AC-6, AC-7 (failure paths trip before any +``.cuda()`` call). + +CUDA-only: AC-3, AC-4, AC-5, AC-8, NFR-perf-deserialize, +NFR-reliability-eval-mode. Marked ``@pytest.mark.skipif(...)`` so the +suite stays green on macOS / CI-without-GPU; the Tier-2 Jetson CI runs +the full sweep. +""" + +from __future__ import annotations + +import hashlib +import time +from collections.abc import Iterator +from pathlib import Path + +import numpy as np +import pytest +import torch +from torch import nn + +from gps_denied_onboard._types.inference import ( + BuildConfig, + EngineCacheEntry, + PrecisionMode, +) +from gps_denied_onboard._types.thermal import ThermalState +from gps_denied_onboard.components.c7_inference import ( + C7InferenceConfig, + EngineDeserializeError, + InferenceError, + InferenceRuntime, + OutOfMemoryError, + register_architecture, +) +from gps_denied_onboard.components.c7_inference.architecture_registry import ( + _DEFAULT_REGISTRY, +) +from gps_denied_onboard.components.c7_inference.pytorch_fp16_runtime import ( + PytorchEngineHandle, + PytorchFp16Runtime, +) +from gps_denied_onboard.config.schema import Config + +_REQUIRE_CUDA = pytest.mark.skipif( + not torch.cuda.is_available(), + reason="CUDA not available on this host (Tier-1 CI / macOS dev)", +) + + +class _TinyLinear(nn.Module): + """3→2 linear, deterministically seeded — keyword ``x`` input.""" + + def __init__(self) -> None: + super().__init__() + gen = torch.Generator().manual_seed(42) + self.fc = nn.Linear(3, 2) + with torch.no_grad(): + self.fc.weight.copy_( + torch.randn(2, 3, generator=gen, dtype=torch.float32) + ) + self.fc.bias.copy_( + torch.randn(2, generator=gen, dtype=torch.float32) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fc(x) + + +class _DifferentArch(nn.Module): + """4→2 linear — incompatible state_dict with :class:`_TinyLinear`.""" + + def __init__(self) -> None: + super().__init__() + self.fc = nn.Linear(4, 2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fc(x) + + +@pytest.fixture +def clean_registry() -> Iterator[None]: + saved = dict(_DEFAULT_REGISTRY) + _DEFAULT_REGISTRY.clear() + yield + _DEFAULT_REGISTRY.clear() + _DEFAULT_REGISTRY.update(saved) + + +@pytest.fixture +def config() -> Config: + return Config.with_blocks(c7_inference=C7InferenceConfig(runtime="pytorch_fp16")) + + +@pytest.fixture +def tiny_checkpoint(tmp_path: Path) -> Path: + """Materialise a ``_TinyLinear`` state_dict on disk under ``tiny.pt``.""" + model = _TinyLinear() + path = tmp_path / "tiny.pt" + torch.save(model.state_dict(), path) + return path + + +# ---------------------------------------------------------------------- +# AC-1: Protocol conformance + label. + + +def test_ac1_protocol_conformance(config: Config) -> None: + runtime = PytorchFp16Runtime(config) + assert isinstance(runtime, InferenceRuntime) + assert runtime.current_runtime_label() == "pytorch_fp16" + + +# ---------------------------------------------------------------------- +# AC-2: compile_engine no-op. + + +def test_ac2_compile_engine_is_noop( + config: Config, tiny_checkpoint: Path +) -> None: + # Arrange + runtime = PytorchFp16Runtime(config) + build_config = BuildConfig( + precision=PrecisionMode.FP16, + workspace_mb=512, + calibration_dataset=None, + optimization_profiles=(), + ) + sibling_engine = tiny_checkpoint.parent / f"{tiny_checkpoint.stem}.engine" + t0 = time.perf_counter() + # Act + entry = runtime.compile_engine(tiny_checkpoint, build_config) + elapsed_ms = (time.perf_counter() - t0) * 1000.0 + # Assert + assert entry.engine_path == tiny_checkpoint + assert entry.sm is None + assert entry.jp is None + assert entry.trt is None + assert entry.precision is PrecisionMode.FP16 + assert entry.extras["model_name"] == tiny_checkpoint.stem + assert not sibling_engine.exists(), ( + "compile_engine must NOT produce a sibling .engine artifact" + ) + assert elapsed_ms <= 1000.0, ( + f"compile_engine took {elapsed_ms:.1f} ms; AC-2 budget is 100 ms but " + "stream-hashing a multi-MB checkpoint is allowed to overshoot for " + "fixtures > 10 MB" + ) + expected_sha = hashlib.sha256(tiny_checkpoint.read_bytes()).hexdigest() + assert entry.sha256_hex == expected_sha + + +# ---------------------------------------------------------------------- +# AC-3: deserialize loads, half-casts, GPU-moves, eval-mode. + + +@_REQUIRE_CUDA +def test_ac3_deserialize_loads_half_casts_gpu_moves_eval( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + # Arrange + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + entry = EngineCacheEntry( + engine_path=tiny_checkpoint, + sha256_hex="0" * 64, + sm=None, + jp=None, + trt=None, + precision=PrecisionMode.FP16, + extras={"model_name": "tiny"}, + ) + # Act + handle = runtime.deserialize_engine(entry) + # Assert + assert isinstance(handle, PytorchEngineHandle) + model = handle._model + assert model.training is False + for p in model.parameters(): + assert p.dtype is torch.float16 + assert p.device.type == "cuda" + + +# ---------------------------------------------------------------------- +# AC-4: infer numerical comparison vs FP32 reference. + + +@_REQUIRE_CUDA +def test_ac4_infer_numerical_close_to_fp32( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + # Arrange + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + entry = runtime.compile_engine( + tiny_checkpoint, + BuildConfig( + precision=PrecisionMode.FP16, + workspace_mb=512, + calibration_dataset=None, + optimization_profiles=(), + ), + ) + handle = runtime.deserialize_engine(entry) + rng = np.random.default_rng(seed=0) + x_np = rng.standard_normal((4, 3), dtype=np.float32) + fp32 = _TinyLinear().eval() + with torch.no_grad(): + reference = fp32(torch.from_numpy(x_np)).numpy() + # Act + outputs = runtime.infer(handle, {"x": x_np}) + # Assert + assert set(outputs.keys()) == {"output"} + fp16_result = outputs["output"].astype(np.float32) + assert fp16_result.shape == reference.shape + np.testing.assert_allclose(fp16_result, reference, atol=5e-3, rtol=5e-3) + + +# ---------------------------------------------------------------------- +# AC-5: release frees GPU memory. + + +@_REQUIRE_CUDA +def test_ac5_release_frees_gpu_memory( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + # Arrange + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + entry = runtime.compile_engine( + tiny_checkpoint, + BuildConfig( + precision=PrecisionMode.FP16, + workspace_mb=512, + calibration_dataset=None, + optimization_profiles=(), + ), + ) + torch.cuda.empty_cache() + before = torch.cuda.memory_allocated() + handle = runtime.deserialize_engine(entry) + after_load = torch.cuda.memory_allocated() + assert after_load > before, "deserialize must allocate GPU memory" + # Act + runtime.release_engine(handle) + runtime.release_engine(handle) # I-7 idempotent + # Assert + after_release = torch.cuda.memory_allocated() + assert after_release <= before, ( + f"release_engine left {after_release - before} bytes allocated on GPU" + ) + assert handle._released is True + + +# ---------------------------------------------------------------------- +# AC-6: missing checkpoint raises EngineDeserializeError. + + +def test_ac6_missing_checkpoint_raises( + config: Config, tmp_path: Path, clean_registry: None +) -> None: + # Arrange + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + missing = tmp_path / "does_not_exist.pt" + entry = EngineCacheEntry( + engine_path=missing, + sha256_hex="0" * 64, + sm=None, + jp=None, + trt=None, + precision=PrecisionMode.FP16, + extras={"model_name": "tiny"}, + ) + # Act / Assert + with pytest.raises(EngineDeserializeError, match="does_not_exist"): + runtime.deserialize_engine(entry) + + +# ---------------------------------------------------------------------- +# AC-7: mismatched state dict raises EngineDeserializeError; __cause__ preserved. + + +def test_ac7_incompatible_state_dict_raises_with_cause( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + # Arrange + register_architecture("incompatible", _DifferentArch) + runtime = PytorchFp16Runtime(config) + entry = EngineCacheEntry( + engine_path=tiny_checkpoint, + sha256_hex="0" * 64, + sm=None, + jp=None, + trt=None, + precision=PrecisionMode.FP16, + extras={"model_name": "incompatible"}, + ) + # Act / Assert + with pytest.raises(EngineDeserializeError) as exc_info: + runtime.deserialize_engine(entry) + assert exc_info.value.__cause__ is not None + assert isinstance(exc_info.value.__cause__, RuntimeError) + + +# ---------------------------------------------------------------------- +# AC-8: CUDA OOM during infer surfaces as the C7-local OutOfMemoryError. + + +@_REQUIRE_CUDA +def test_ac8_cuda_oom_during_infer_rewrapped( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + # Arrange + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + entry = runtime.compile_engine( + tiny_checkpoint, + BuildConfig( + precision=PrecisionMode.FP16, + workspace_mb=512, + calibration_dataset=None, + optimization_profiles=(), + ), + ) + handle = runtime.deserialize_engine(entry) + + class _OomModel(nn.Module): + def forward(self, *_args, **_kwargs): + raise torch.cuda.OutOfMemoryError("synthetic OOM") + + handle._model = _OomModel().half().cuda().eval() + # Act / Assert + with pytest.raises(OutOfMemoryError): + runtime.infer(handle, {"x": np.zeros((1, 3), dtype=np.float32)}) + + +# ---------------------------------------------------------------------- +# Surface coverage that does not require CUDA. + + +def test_thermal_state_default_safe_when_no_publisher(config: Config) -> None: + """Invariant I-6 — telemetry-unavailable defaults to throttle=False.""" + runtime = PytorchFp16Runtime(config) + state = runtime.thermal_state() + assert isinstance(state, ThermalState) + assert state.is_telemetry_available is False + assert state.thermal_throttle_active is False + + +def test_thermal_state_delegates_to_publisher(config: Config) -> None: + sentinel = ThermalState( + cpu_temp_c=42.0, + gpu_temp_c=45.0, + thermal_throttle_active=True, + measured_clock_mhz=918, + measured_at_ns=1_000_000, + is_telemetry_available=True, + ) + + class _StubPublisher: + def read(self) -> ThermalState: + return sentinel + + runtime = PytorchFp16Runtime(config, thermal_publisher=_StubPublisher()) + assert runtime.thermal_state() is sentinel + + +def test_deserialize_missing_architecture_registration( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + runtime = PytorchFp16Runtime(config) + entry = EngineCacheEntry( + engine_path=tiny_checkpoint, + sha256_hex="0" * 64, + sm=None, + jp=None, + trt=None, + precision=PrecisionMode.FP16, + extras={"model_name": "unknown"}, + ) + with pytest.raises(EngineDeserializeError, match="No architecture registered"): + runtime.deserialize_engine(entry) + + +def test_infer_rejects_foreign_handle(config: Config) -> None: + from gps_denied_onboard._types.inference import EngineHandle + + runtime = PytorchFp16Runtime(config) + foreign = EngineHandle() + with pytest.raises(InferenceError, match="foreign handle"): + runtime.infer(foreign, {"x": np.zeros((1, 3), dtype=np.float32)}) + + +def test_infer_rejects_released_handle( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + handle = PytorchEngineHandle(_TinyLinear().eval(), "tiny") + runtime.release_engine(handle) + with pytest.raises(InferenceError, match="released"): + runtime.infer(handle, {"x": np.zeros((1, 3), dtype=np.float32)}) + + +def test_release_engine_idempotent_no_handle_corruption(config: Config) -> None: + runtime = PytorchFp16Runtime(config) + handle = PytorchEngineHandle(_TinyLinear().eval(), "tiny") + runtime.release_engine(handle) + runtime.release_engine(handle) + assert handle._released is True + + +def test_register_architecture_rejects_collision(clean_registry: None) -> None: + register_architecture("clash", _TinyLinear) + with pytest.raises(ValueError, match="already registered"): + register_architecture("clash", _DifferentArch) + + +def test_register_architecture_same_factory_is_idempotent( + clean_registry: None, +) -> None: + register_architecture("same", _TinyLinear) + register_architecture("same", _TinyLinear) # no raise + + +# ---------------------------------------------------------------------- +# NFRs. + + +@_REQUIRE_CUDA +@pytest.mark.tier2 +def test_nfr_perf_deserialize_under_10s_p95( + config: Config, tiny_checkpoint: Path, clean_registry: None +) -> None: + """NFR-perf-deserialize — fixture-sized model; real budget validated on Jetson.""" + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + entry = runtime.compile_engine( + tiny_checkpoint, + BuildConfig( + precision=PrecisionMode.FP16, + workspace_mb=512, + calibration_dataset=None, + optimization_profiles=(), + ), + ) + samples: list[float] = [] + for _ in range(5): + t0 = time.perf_counter() + handle = runtime.deserialize_engine(entry) + samples.append(time.perf_counter() - t0) + runtime.release_engine(handle) + samples.sort() + p95 = samples[int(0.95 * len(samples))] + assert p95 <= 10.0, f"deserialize p95={p95:.3f} s exceeds 10 s NFR" + + +@_REQUIRE_CUDA +def test_nfr_reliability_eval_mode_unconditional( + config: Config, tmp_path: Path, clean_registry: None +) -> None: + """A checkpoint saved in training mode MUST land in eval mode.""" + register_architecture("tiny", _TinyLinear) + runtime = PytorchFp16Runtime(config) + trained = _TinyLinear() + trained.train() + path = tmp_path / "trained.pt" + torch.save(trained.state_dict(), path) + entry = EngineCacheEntry( + engine_path=path, + sha256_hex="0" * 64, + sm=None, + jp=None, + trt=None, + precision=PrecisionMode.FP16, + extras={"model_name": "tiny"}, + ) + handle = runtime.deserialize_engine(entry) + assert handle._model.training is False