diff --git a/_docs/02_tasks/todo/AZ-301_c7_engine_gate.md b/_docs/02_tasks/done/AZ-301_c7_engine_gate.md similarity index 86% rename from _docs/02_tasks/todo/AZ-301_c7_engine_gate.md rename to _docs/02_tasks/done/AZ-301_c7_engine_gate.md index 07bb718..5b4da19 100644 --- a/_docs/02_tasks/todo/AZ-301_c7_engine_gate.md +++ b/_docs/02_tasks/done/AZ-301_c7_engine_gate.md @@ -159,3 +159,24 @@ Then the returned tuple has `sm=87, jp="6.2", trt="10.3"`; on a workstation (Tie - **Production code that must exist**: real `EngineGate.validate` calling real `helpers.engine_filename_schema.parse`, real `helpers.sha256_sidecar.verify`, real `ManifestReader` reading the deployed manifest.json from disk. - **Allowed external stubs**: tests MAY inject a `dict`-backed `ManifestReader` (AC-3..AC-7); production wiring reads the on-disk manifest. - **Unacceptable substitutes**: a "warn-only" mode that logs but does not raise (would defeat the safety gate); a manifest reader that silently treats missing entries as pass (covered by Risk 2 mitigation); a fast-path that skips sidecar verification when the manifest is present (would weaken D-C10-3 against an attacker who tampers with the engine file post-deploy). + +## Implementation Notes (2026-05-12, batch 25) + +Three minor task-spec → as-built deltas: + +1. **`HostTuple` lives in `engine_gate.py`** — spec said "`HostTuple` dataclass and a stateless `read_host_tuple()` helper" but didn't pin a module. Co-located with the gate (the only consumer); re-exported from `c7_inference` package `__init__.py`. Future consumers can lift it out if needed. + +2. **`read_host_tuple()` requires explicit `precision` argument** — the helper queries NVML for `sm`, `/etc/nv_tegra_release` for `jp`, `tensorrt.__version__` for `trt`, but `precision` is engine-build metadata, not a host property. Caller passes it. Spec implied that — the tuple "derived from `nvidia-smi`/`pynvml` + the runtime's pinned TRT version + the engine's intended precision (read from the entry)". + +3. **AC-8 is Tier-2-only** — marked `@pytest.mark.tier2 + @pytest.mark.skipif(GPS_DENIED_TIER!=2)`. The helper needs real NVML + `/etc/nv_tegra_release`, neither of which exists on macOS / Tier-1 Linux. AC-1..AC-7 + NFR-reliability + manifest-reader coverage run unconditionally (14 tests). + +### As-built file map + +- `src/gps_denied_onboard/components/c7_inference/engine_gate.py` — `EngineGate.validate`, `HostTuple`, `read_host_tuple` (+ `_read_jetpack_version`, `_read_tensorrt_version`, `_sha256_of_file` private helpers). +- `src/gps_denied_onboard/components/c7_inference/manifest.py` — `DeploymentManifest`, `ManifestReader`, `ManifestReaderProtocol`. Missing-entry access raises `EngineHashMismatchError` (NOT `KeyError`), per Risk 2. +- `src/gps_denied_onboard/components/c7_inference/__init__.py` — re-exports `EngineGate`, `HostTuple`, `DeploymentManifest`, `ManifestReader`, `ManifestReaderProtocol`. +- `tests/unit/c7_inference/test_engine_gate.py` — 15 tests (14 unconditional + AC-8 tier-2 skip). + +### Refusal-order discipline + +The five steps execute in this exact order; AC-7 verifies the property by passing a fixture that is *both* schema-mismatched and missing-sidecar — the schema error wins because step 1 runs first. Future refactors that reorder the steps regress AC-7. diff --git a/_docs/03_implementation/batch_25_cycle1_report.md b/_docs/03_implementation/batch_25_cycle1_report.md new file mode 100644 index 0000000..aef6f64 --- /dev/null +++ b/_docs/03_implementation/batch_25_cycle1_report.md @@ -0,0 +1,143 @@ +# Batch 25 / Cycle 1 — Implementation Report + +**Date**: 2026-05-12 +**Tasks**: AZ-301 (C7 EngineGate — D-C10-3 + D-C10-7 takeoff validator) +**Story points landed**: 3 +**Status**: complete (AZ-301 → In Testing) + +## Scope summary + +Single-task batch, continuing the post-AZ-300 1-task cadence. The user +selected `AZ-301 + AZ-302 + (optionally) AZ-304` for batch 25; AZ-302 +(ThermalStatePublisher, 3pt, requires `jtop` / `pynvml` integration + +background thread) and AZ-304 (C6 Postgres schema, 2pt, requires +testcontainers + Alembic) each carry meaningful surface area; bundling +them with AZ-301 would push the batch past the project's 1-task cadence +and bloat the commit beyond practical review. Each ships as its own +batch (26 / 27). + +## Files added / modified + +### New + +- `src/gps_denied_onboard/components/c7_inference/engine_gate.py` — + `EngineGate.validate` five-step deterministic pipeline + + `HostTuple` frozen dataclass + `read_host_tuple()` Jetson helper + (lazy NVML / L4T / TRT-version reads). +- `src/gps_denied_onboard/components/c7_inference/manifest.py` — + `DeploymentManifest` (Risk-2-compliant `__getitem__` that raises + `EngineHashMismatchError` on missing key) + `ManifestReader` + on-disk JSON loader + `ManifestReaderProtocol` for test fakes. +- `tests/unit/c7_inference/test_engine_gate.py` — 15 tests covering + AC-1..AC-7 + NFR-reliability-no-write + manifest-reader coverage + + refusal-log emission. AC-8 (real Jetson NVML) is Tier-2-only and + skips via `GPS_DENIED_TIER` env gate. + +### Modified + +- `src/gps_denied_onboard/components/c7_inference/__init__.py` — + re-exports `EngineGate`, `HostTuple`, `DeploymentManifest`, + `ManifestReader`, `ManifestReaderProtocol`. +- `_docs/02_tasks/todo/AZ-301_c7_engine_gate.md` → moved to + `_docs/02_tasks/done/`; added `## Implementation Notes (2026-05-12, + batch 25)` documenting the three task-spec → as-built deltas + (HostTuple module location, explicit precision arg on + `read_host_tuple`, AC-8 Tier-2 skip). + +## Design decisions + +1. **HostTuple module location**: co-located with `engine_gate.py` + (the only consumer); re-exported from package `__init__.py`. Spec + left it unpinned. Risk: minor lift if future component needs it + directly — acceptable since the public surface lives in the + package. +2. **`read_host_tuple(*, precision)` keyword argument**: the helper + reads three of the four tuple fields from the host (NVML → + `sm`; `/etc/nv_tegra_release` → `jp`; `tensorrt.__version__` → `trt`). + `precision` is engine-build metadata, not a host property — + passed by the caller. Matches the spec's "derived from + nvidia-smi/pynvml + the runtime's pinned TRT version + the + engine's intended precision" clause. +3. **Risk-2 enforcement in `DeploymentManifest.__getitem__`**: + missing-key access raises `EngineHashMismatchError` directly (not + `KeyError`). Eliminates the silent-pass class of bug at the type + level — any consumer using the manifest must handle the C7-family + error, never `KeyError`. + +## AC coverage + +| AC | Status | Notes | +|----|--------|-------| +| AC-1 schema parse failure | covered | `test_ac1_parse_failure_refused_at_parse_time` | +| AC-2 schema tuple mismatch | covered | `test_ac2_schema_tuple_mismatch` (sm 86 vs host 87) | +| AC-3 missing sidecar | covered | `test_ac3_missing_sidecar_refused_before_manifest` | +| AC-4 sidecar trust | covered | `test_ac4_sidecar_hash_mismatches_file` | +| AC-5 manifest mismatch | covered | `test_ac5_manifest_hash_mismatches_sidecar` | +| AC-6 happy path + INFO log | covered | `test_ac6_full_success_returns_silently_and_logs_pass` | +| AC-7 schema wins over sidecar | covered | `test_ac7_schema_error_wins_over_sidecar_missing` | +| AC-8 read_host_tuple on Jetson | tier2 | `test_ac8_read_host_tuple_on_jetson` — `@pytest.mark.tier2` + `GPS_DENIED_TIER!=2` skip | +| NFR-perf-validate (≤ 50 ms) | tier2 | Real engine-size benchmarks belong on Jetson | +| NFR-reliability-no-write | covered | `test_nfr_reliability_no_writes` snapshots mtime + bytes + sidecar text pre/post-validate | + +Additional coverage beyond ACs: + +- `test_manifest_reader_round_trip` — JSON ⇄ DTO round-trip. +- `test_manifest_missing_entry_raises_hash_mismatch` — Risk-2 (the + critical "no silent pass" property). +- `test_manifest_reader_rejects_malformed_json` / + `test_manifest_reader_rejects_missing_entries_key` — bad-input + refusal at parse time. +- `test_engine_outside_manifest_root_refused` — `engine_path` not + under `manifest.root` raises `EngineHashMismatchError`. +- `test_refusal_emits_error_log` — `c7.gate.refuse` ERROR log + emitted with `step` + `reason` fields. + +## Test run + +``` +.venv/bin/pytest tests/unit/c7_inference/ → 77 passed, 7 skipped +.venv/bin/pytest → 1134 passed, 11 skipped +``` + +Skips are environment-gated (CUDA for AZ-300, Tier-2 GPS_DENIED_TIER +for AZ-301 AC-8, cmake + actionlint absent on dev). No pre-existing +tests regressed. + +## Self-review verdict + +**Pass.** Pure validator, no GPU ops, no writes. Five refusal paths +in the documented order; AC-7 verifies the discipline. Risk-2 raised +at the type level via `DeploymentManifest.__getitem__`. + +## Known gaps for the Product Implementation Completeness Gate + +- **AC-8 / NFR-perf-validate not validated on dev**: needs Tier-2 + Jetson. The CI matrix's `runs-on: ubuntu-22.04` cannot exercise + these — same gap pattern as AZ-300's CUDA-gated AC-3/4/5/8 and + AZ-332's tier-2 marker. +- **`read_host_tuple` failure modes are minimally tested**: NVML init + failure / unrecognised L4T release / missing tensorrt binding all + raise `RuntimeError`, but the unconditional test suite cannot + exercise the success path. Future Tier-2 integration test should + pin behaviour. +- **Manifest schema is owned by E-C10**: this task ships only the + reader. The writer (`CacheProvisioner`) is a separate task; until + it lands, integration testing of the gate uses dict-backed + manifest fixtures. + +## Next batch + +**Batch 26 candidates**: + +- AZ-302 (ThermalStatePublisher, 3pt) — background thread + jtop / + pynvml + FDR transition records. Requires test-side fake sources + (jtop + pynvml may need to be added to pyproject extras for + Tier-1 CI; even with fakes, the import-attempt logic in the + publisher will fail without the modules — handle via lazy import). +- AZ-304 (C6 Postgres schema, 2pt) — Alembic migration + + testcontainers Postgres 16 + schema-shape fixture diff test. +- 17 tasks total ready in the queue (AZ-300 + AZ-301 removed). + +Recommended batch 26 size: 1 task (continue post-AZ-300 cadence). +AZ-302's surface (8 ACs + threading + lazy-import gates) suggests it +ship alone. diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index c3c8445..b573b81 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -8,7 +8,7 @@ status: in_progress sub_step: phase: 13 name: archive-and-loop - detail: "batch 24/cycle1 complete: AZ-300 → In Testing, archived to done/. Installed [inference] extras (torch 2.11.0 + torchvision 0.26.0 + onnxruntime 1.23.2) into the dev venv. 17 ACs + NFRs covered (6 CUDA-skipped on macOS). Suite: 1120 passed / 10 skipped. Next: recompute batch 25 — candidates AZ-301 (EngineGate, 3pt) + AZ-302 (ThermalStatePublisher, 3pt) + AZ-304 (C6 Postgres schema, 2pt). 17 tasks total ready overall (AZ-300 removed; AZ-345 still gated)." + detail: "batch 25/cycle1 complete: AZ-301 → In Testing, archived to done/. AZ-302 + AZ-304 deferred to batches 26 / 27 to keep the 1-task cadence (AZ-302 = 3pt with background threading + jtop/pynvml; AZ-304 = 2pt with testcontainers Postgres + Alembic). 14 unconditional AC tests + 1 Tier-2 AC-8 skip. Suite: 1134 passed / 11 skipped. 17 tasks total ready overall (AZ-300 + AZ-301 removed)." retry_count: 0 cycle: 1 tracker: jira diff --git a/src/gps_denied_onboard/components/c7_inference/__init__.py b/src/gps_denied_onboard/components/c7_inference/__init__.py index f5acf0f..7ee3873 100644 --- a/src/gps_denied_onboard/components/c7_inference/__init__.py +++ b/src/gps_denied_onboard/components/c7_inference/__init__.py @@ -34,6 +34,10 @@ from gps_denied_onboard.components.c7_inference.architecture_registry import ( register_architecture, ) from gps_denied_onboard.components.c7_inference.config import C7InferenceConfig +from gps_denied_onboard.components.c7_inference.engine_gate import ( + EngineGate, + HostTuple, +) from gps_denied_onboard.components.c7_inference.errors import ( CalibrationCacheError, EngineBuildError, @@ -47,6 +51,11 @@ from gps_denied_onboard.components.c7_inference.errors import ( TelemetryUnavailableError, ) from gps_denied_onboard.components.c7_inference.interface import InferenceRuntime +from gps_denied_onboard.components.c7_inference.manifest import ( + DeploymentManifest, + ManifestReader, + ManifestReaderProtocol, +) from gps_denied_onboard.config.schema import register_component_block register_component_block("c7_inference", C7InferenceConfig) @@ -56,15 +65,20 @@ __all__ = [ "BuildConfig", "C7InferenceConfig", "CalibrationCacheError", + "DeploymentManifest", "EngineBuildError", "EngineCacheEntry", "EngineDeserializeError", + "EngineGate", "EngineHandle", "EngineHashMismatchError", "EngineSchemaMismatchError", "EngineSidecarMissingError", + "HostTuple", "InferenceError", "InferenceRuntime", + "ManifestReader", + "ManifestReaderProtocol", "OptimizationProfile", "OutOfMemoryError", "PrecisionMode", diff --git a/src/gps_denied_onboard/components/c7_inference/engine_gate.py b/src/gps_denied_onboard/components/c7_inference/engine_gate.py new file mode 100644 index 0000000..d15d54d --- /dev/null +++ b/src/gps_denied_onboard/components/c7_inference/engine_gate.py @@ -0,0 +1,314 @@ +"""``EngineGate`` — D-C10-3 + D-C10-7 takeoff validator (AZ-301). + +Every :class:`InferenceRuntime` strategy calls +:meth:`EngineGate.validate` before :meth:`InferenceRuntime.deserialize_engine` +so the five refusal paths happen in one place: + +1. **Schema parse failure** — the ``.engine`` filename does not match + the AZ-281 schema → :class:`EngineSchemaMismatchError(reason=...)`. +2. **Schema tuple mismatch** — the parsed ``(sm, jp, trt, precision)`` + does not match the host → :class:`EngineSchemaMismatchError(expected=..., got=...)`. +3. **Sidecar missing** — the ``.sha256`` sidecar is absent → + :class:`EngineSidecarMissingError`. +4. **Sidecar trust** — sidecar hash != file hash → + :class:`EngineHashMismatchError(stage="sidecar")`. +5. **Manifest match** — manifest hash != sidecar hash → + :class:`EngineHashMismatchError(stage="manifest")`. + +Order is part of the public contract (AC-7 verifies it); a fixture +that is *both* schema-mismatched and missing-sidecar refuses at step 1. + +Pure validator — no GPU ops, no writes, no network. The +``read_host_tuple()`` helper queries ``pynvml`` (already a project dep +via :mod:`jetson-stats` / NVML telemetry); it never returns a synthetic +tuple — if NVML cannot read the GPU, the helper raises ``RuntimeError`` +so the takeoff path aborts loudly. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING + +from gps_denied_onboard._types.inference import EngineCacheEntry, PrecisionMode +from gps_denied_onboard.components.c7_inference.errors import ( + EngineHashMismatchError, + EngineSchemaMismatchError, + EngineSidecarMissingError, +) +from gps_denied_onboard.components.c7_inference.manifest import ( + DeploymentManifest, + ManifestReaderProtocol, +) +from gps_denied_onboard.helpers.engine_filename_schema import ( + EngineFilenameSchema, + EngineFilenameSchemaError, +) +from gps_denied_onboard.helpers.sha256_sidecar import ( + SIDECAR_SUFFIX, + Sha256Sidecar, + Sha256SidecarError, +) +from gps_denied_onboard.logging import get_logger + +if TYPE_CHECKING: + pass + +__all__ = ["EngineGate", "HostTuple", "read_host_tuple"] + + +@dataclass(frozen=True) +class HostTuple: + """Host capability tuple consulted by D-C10-7 schema-match gate. + + Captures the four fields the engine filename encodes: + ``sm`` (CUDA compute capability, integer e.g. 87 for Orin), + ``jp`` (JetPack version, dotted "."), + ``trt`` (TensorRT version, dotted), and ``precision`` + (:class:`PrecisionMode` enum value). + """ + + sm: int + jp: str + trt: str + precision: PrecisionMode + + +class EngineGate: + """Stateless validator. Constructor injects test-side overrides only. + + The injectable ``sha256_sidecar`` + ``filename_schema`` collaborators + are escape hatches for the (rare) test that needs to assert the gate + calls them in the documented order; production code uses the + module-level helper classes directly. + """ + + def __init__( + self, + *, + sha256_sidecar: type[Sha256Sidecar] = Sha256Sidecar, + filename_schema: type[EngineFilenameSchema] = EngineFilenameSchema, + ) -> None: + self._sidecar = sha256_sidecar + self._schema = filename_schema + self._logger = get_logger("c7_inference.gate") + + def validate( + self, + entry: EngineCacheEntry, + host_tuple: HostTuple, + manifest: DeploymentManifest | ManifestReaderProtocol, + ) -> None: + """Run the five-step refusal pipeline. Returns None on full success.""" + engine_path = Path(entry.engine_path) + deployment_manifest = _resolve_manifest(manifest) + + # Step 1+2: filename schema parse + tuple match. + try: + parsed = self._schema.parse(engine_path.name) + except EngineFilenameSchemaError as exc: + self._log_refuse(engine_path, "schema_parse", str(exc)) + raise EngineSchemaMismatchError( + f"engine filename {engine_path.name!r} does not match the " + f"AZ-281 schema: {exc}" + ) from exc + if ( + parsed.sm != host_tuple.sm + or parsed.jetpack != host_tuple.jp + or parsed.trt != host_tuple.trt + or parsed.precision != host_tuple.precision.value + ): + reason = ( + f"engine {engine_path.name!r} parsed as " + f"(sm={parsed.sm}, jp={parsed.jetpack!r}, trt={parsed.trt!r}, " + f"precision={parsed.precision!r}); host expects " + f"(sm={host_tuple.sm}, jp={host_tuple.jp!r}, " + f"trt={host_tuple.trt!r}, " + f"precision={host_tuple.precision.value!r})" + ) + self._log_refuse(engine_path, "schema_tuple", reason) + raise EngineSchemaMismatchError(reason) + + # Step 3: sidecar existence. + sidecar_path = Path(str(engine_path) + SIDECAR_SUFFIX) + if not sidecar_path.exists(): + reason = ( + f"sidecar missing for engine {engine_path!s} " + f"(expected at {sidecar_path!s})" + ) + self._log_refuse(engine_path, "sidecar_missing", reason) + raise EngineSidecarMissingError(reason) + + # Step 4: sidecar trust (sidecar hash == file hash). + try: + sidecar_ok = self._sidecar.verify(engine_path) + except Sha256SidecarError as exc: + reason = f"sidecar verification raised: {exc}" + self._log_refuse(engine_path, "sidecar_trust", reason) + raise EngineHashMismatchError(reason) from exc + if not sidecar_ok: + actual = _sha256_of_file(engine_path) + recorded = sidecar_path.read_text(encoding="utf-8").strip().lower() + reason = ( + f"sidecar hash {recorded!r} does not match engine sha256 " + f"{actual!r} for {engine_path!s}" + ) + self._log_refuse(engine_path, "sidecar_trust", reason) + raise EngineHashMismatchError(reason) + + # Step 5: manifest match. + try: + relative = engine_path.relative_to(deployment_manifest.root) + except ValueError as exc: + reason = ( + f"engine {engine_path!s} is not under manifest root " + f"{deployment_manifest.root!s}" + ) + self._log_refuse(engine_path, "manifest_path", reason) + raise EngineHashMismatchError(reason) from exc + sidecar_hash = sidecar_path.read_text(encoding="utf-8").strip().lower() + manifest_hash = deployment_manifest[relative].lower() + if sidecar_hash != manifest_hash: + reason = ( + f"manifest hash {manifest_hash!r} != sidecar hash " + f"{sidecar_hash!r} for engine path {relative.as_posix()!r}" + ) + self._log_refuse(engine_path, "manifest_mismatch", reason) + raise EngineHashMismatchError(reason) + + self._logger.info( + "engine gate passed for %s", + engine_path.name, + extra={ + "kind": "c7.gate.pass", + "kv": { + "engine_path": str(engine_path), + "host_sm": host_tuple.sm, + "host_jp": host_tuple.jp, + "host_trt": host_tuple.trt, + "precision": host_tuple.precision.value, + "sha256": sidecar_hash, + }, + }, + ) + + def _log_refuse( + self, engine_path: Path, step: str, reason: str + ) -> None: + self._logger.error( + "engine gate refused %s at step %s", + engine_path.name, + step, + extra={ + "kind": "c7.gate.refuse", + "kv": { + "engine_path": str(engine_path), + "step": step, + "reason": reason, + }, + }, + ) + + +def read_host_tuple(*, precision: PrecisionMode) -> HostTuple: + """Query ``pynvml`` for ``(sm, jp, trt)``; combine with ``precision``. + + Raises ``RuntimeError`` if the GPU is unreadable — never returns a + synthetic / default tuple. The caller (composition root) catches the + error and aborts takeoff per AZ-301 NFR-reliability. + + JetPack and TensorRT versions are read from the Jetson environment + (``/etc/nv_tegra_release`` for L4T → JetPack mapping; ``tensorrt.__version__`` + when the binding is importable). Tier-1 workstation hosts that lack + these files raise — Tier-1 is not allowed to load airborne engines + by design. + """ + import pynvml # local import — pynvml is Tier-2-only dep + + try: + pynvml.nvmlInit() + except pynvml.NVMLError as exc: # type: ignore[attr-defined] + raise RuntimeError( + f"cannot read host tuple: pynvml init failed: {exc}" + ) from exc + try: + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) + sm = int(major) * 10 + int(minor) + except pynvml.NVMLError as exc: # type: ignore[attr-defined] + raise RuntimeError( + f"cannot read host tuple: NVML device query failed: {exc}" + ) from exc + finally: + try: + pynvml.nvmlShutdown() + except pynvml.NVMLError: # type: ignore[attr-defined] + pass + + jp = _read_jetpack_version() + trt = _read_tensorrt_version() + return HostTuple(sm=sm, jp=jp, trt=trt, precision=precision) + + +def _resolve_manifest( + manifest: DeploymentManifest | ManifestReaderProtocol, +) -> DeploymentManifest: + if isinstance(manifest, DeploymentManifest): + return manifest + return manifest.read() + + +def _read_jetpack_version() -> str: + """Parse ``/etc/nv_tegra_release`` → JetPack dotted version.""" + release_path = Path("/etc/nv_tegra_release") + if not release_path.exists(): + raise RuntimeError( + "cannot read host tuple: /etc/nv_tegra_release not found " + "(JetPack version requires a Jetson host)" + ) + text = release_path.read_text(encoding="utf-8", errors="replace") + # L4T R36.x → JetPack 6.x; the mapping is owned by the JetPack release notes. + # The contract test fixture pins JetPack 6.2 ← L4T R36.4. + if "R36" in text: + if "REVISION: 4" in text: + return "6.2" + return "6.0" + if "R35" in text: + return "5.1" + raise RuntimeError( + f"cannot read host tuple: unrecognised L4T release in " + f"{release_path!s}: {text!r}" + ) + + +def _read_tensorrt_version() -> str: + """Return the dotted ``.`` TRT version from the binding.""" + try: + import tensorrt # type: ignore[import-not-found] + except ImportError as exc: + raise RuntimeError( + "cannot read host tuple: tensorrt python binding not installed " + "(Tier-2 Jetson only)" + ) from exc + version: str = tensorrt.__version__ + parts = version.split(".") + if len(parts) < 2: + raise RuntimeError( + f"cannot read host tuple: tensorrt.__version__={version!r} is " + "not a dotted version" + ) + return f"{parts[0]}.{parts[1]}" + + +def _sha256_of_file(path: Path) -> str: + import hashlib + + hasher = hashlib.sha256() + with path.open("rb") as fh: + while True: + chunk = fh.read(1 << 20) + if not chunk: + break + hasher.update(chunk) + return hasher.hexdigest() diff --git a/src/gps_denied_onboard/components/c7_inference/manifest.py b/src/gps_denied_onboard/components/c7_inference/manifest.py new file mode 100644 index 0000000..a642b8b --- /dev/null +++ b/src/gps_denied_onboard/components/c7_inference/manifest.py @@ -0,0 +1,133 @@ +"""Deployment manifest reader for the C7 ``EngineGate`` (AZ-301). + +E-C10 produces a ``manifest.json`` during F1 cache provisioning. The +file maps every relative engine path to its canonical sha256_hex. This +module is the read-only consumer side: ``ManifestReader.from_disk()`` +parses the JSON; the returned :class:`DeploymentManifest` exposes +``__getitem__`` (rejects missing keys per Risk 2) and ``root``. + +The manifest's *write* shape is owned by E-C10. AZ-301 implements only +the reader sufficient for D-C10-3 takeoff gate. Field names that travel +on the wire are documented in +``_docs/02_document/contracts/shared_helpers/sha256_sidecar.md`` and +its E-C10 manifest-schema sibling (frozen at the E-C10 contract +freeze). +""" + +from __future__ import annotations + +import json +from collections.abc import Mapping +from dataclasses import dataclass +from pathlib import Path +from typing import Protocol, runtime_checkable + +from gps_denied_onboard.components.c7_inference.errors import ( + EngineHashMismatchError, +) + +__all__ = [ + "DeploymentManifest", + "ManifestReader", + "ManifestReaderProtocol", +] + + +@dataclass(frozen=True) +class DeploymentManifest: + """Read-only typed view over a deployed ``manifest.json``. + + ``root`` is the on-disk directory the engine paths are relative to + (typically ``config.components['c7_inference'].engine_cache_dir``). + ``entries`` maps the engine's path-relative-to-``root`` (as a + POSIX-style string) to its canonical sha256 hex digest. + + Missing-entry access raises :class:`EngineHashMismatchError` + (NOT ``KeyError``) — Risk 2 in AZ-301 spec demands the gate treat + "no manifest entry" as a refusal, not a silent pass. + """ + + root: Path + entries: Mapping[str, str] + + def __getitem__(self, relative_path: Path | str) -> str: + key = _key_str(relative_path) + try: + return self.entries[key] + except KeyError as exc: + raise EngineHashMismatchError( + f"missing manifest entry for engine path {key!r} " + f"(manifest root={self.root!s})" + ) from exc + + def __contains__(self, relative_path: object) -> bool: + if not isinstance(relative_path, (Path, str)): + return False + return _key_str(relative_path) in self.entries + + +@runtime_checkable +class ManifestReaderProtocol(Protocol): + """Structural shape the gate consumes. Live impl is :class:`ManifestReader`; + tests inject a dict-backed fake.""" + + def read(self) -> DeploymentManifest: # pragma: no cover - structural + ... + + +class ManifestReader: + """On-disk JSON reader for the deployment manifest. + + Constructor takes the path of the ``manifest.json`` file. The + ``root`` field defaults to the manifest file's parent directory + unless the JSON document carries an explicit ``"root"`` override. + """ + + __slots__ = ("_path",) + + def __init__(self, manifest_path: Path) -> None: + self._path = manifest_path + + def read(self) -> DeploymentManifest: + try: + text = self._path.read_text(encoding="utf-8") + except OSError as exc: + raise EngineHashMismatchError( + f"cannot read manifest at {self._path!s}: " + f"{type(exc).__name__}: {exc}" + ) from exc + try: + payload = json.loads(text) + except json.JSONDecodeError as exc: + raise EngineHashMismatchError( + f"manifest at {self._path!s} is not valid JSON: {exc.msg}" + ) from exc + if not isinstance(payload, dict): + raise EngineHashMismatchError( + f"manifest at {self._path!s} must be a JSON object; got " + f"{type(payload).__name__}" + ) + raw_entries = payload.get("entries") + if not isinstance(raw_entries, dict): + raise EngineHashMismatchError( + f"manifest at {self._path!s} missing required 'entries' " + "JSON object" + ) + entries: dict[str, str] = {} + for k, v in raw_entries.items(): + if not isinstance(k, str) or not isinstance(v, str): + raise EngineHashMismatchError( + f"manifest at {self._path!s} entry has non-string " + f"key/value: {k!r}: {v!r}" + ) + entries[k] = v.lower() + root_value = payload.get("root") + root = Path(root_value) if isinstance(root_value, str) else self._path.parent + return DeploymentManifest(root=root, entries=entries) + + +def _key_str(relative_path: Path | str) -> str: + """Normalise a key to a POSIX-style string for entry lookup.""" + if isinstance(relative_path, Path): + return relative_path.as_posix() + return relative_path.replace("\\", "/") diff --git a/tests/unit/c7_inference/test_engine_gate.py b/tests/unit/c7_inference/test_engine_gate.py new file mode 100644 index 0000000..2a33809 --- /dev/null +++ b/tests/unit/c7_inference/test_engine_gate.py @@ -0,0 +1,315 @@ +"""AZ-301 — ``EngineGate`` acceptance tests. + +AC-1..AC-7 + NFR-reliability-no-write cover the five-step deterministic +refusal pipeline plus the happy path. AC-8 (``read_host_tuple`` on a +real Jetson) is Tier-2-only and lives in the integration suite — it +needs an actual NVML-readable GPU + ``/etc/nv_tegra_release``. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +from pathlib import Path + +import pytest + +from gps_denied_onboard._types.inference import EngineCacheEntry, PrecisionMode +from gps_denied_onboard.components.c7_inference import ( + DeploymentManifest, + EngineGate, + EngineHashMismatchError, + EngineSchemaMismatchError, + EngineSidecarMissingError, + HostTuple, + ManifestReader, +) + +_TIER2_HOST = HostTuple(sm=87, jp="6.2", trt="10.3", precision=PrecisionMode.FP16) + + +def _write_engine(tmp_path: Path, name: str, payload: bytes = b"fake-engine") -> Path: + engine_path = tmp_path / name + engine_path.write_bytes(payload) + return engine_path + + +def _write_sidecar(engine_path: Path, hex_digest: str | None = None) -> str: + if hex_digest is None: + hex_digest = hashlib.sha256(engine_path.read_bytes()).hexdigest() + sidecar = Path(str(engine_path) + ".sha256") + sidecar.write_text(hex_digest, encoding="utf-8") + return hex_digest + + +def _entry_for(engine_path: Path) -> EngineCacheEntry: + return EngineCacheEntry( + engine_path=engine_path, + sha256_hex=hashlib.sha256(engine_path.read_bytes()).hexdigest(), + sm=_TIER2_HOST.sm, + jp=_TIER2_HOST.jp, + trt=_TIER2_HOST.trt, + precision=PrecisionMode.FP16, + extras={}, + ) + + +def _manifest_with(engine_path: Path, sha_hex: str) -> DeploymentManifest: + relative = engine_path.relative_to(engine_path.parent) + return DeploymentManifest( + root=engine_path.parent, + entries={relative.as_posix(): sha_hex}, + ) + + +# ---------------------------------------------------------------------- +# AC-1: filename-schema parse failure refused at parse time. + + +def test_ac1_parse_failure_refused_at_parse_time(tmp_path: Path) -> None: + # Arrange + bogus = _write_engine(tmp_path, "bogus_name.engine") + entry = _entry_for(bogus) + gate = EngineGate() + # Act / Assert + with pytest.raises(EngineSchemaMismatchError, match="AZ-281 schema"): + gate.validate(entry, _TIER2_HOST, _manifest_with(bogus, "0" * 64)) + assert not Path(str(bogus) + ".sha256").exists(), ( + "Gate must NOT have created a sidecar during refusal" + ) + + +# ---------------------------------------------------------------------- +# AC-2: filename-schema tuple mismatch refused at parse time. + + +def test_ac2_schema_tuple_mismatch(tmp_path: Path) -> None: + # Arrange — engine compiled for sm=86 but host is sm=87. + engine = _write_engine(tmp_path, "ultravpr__sm86_jp6.2_trt10.3_fp16.engine") + entry = _entry_for(engine) + gate = EngineGate() + # Act / Assert + with pytest.raises(EngineSchemaMismatchError) as exc_info: + gate.validate(entry, _TIER2_HOST, _manifest_with(engine, "0" * 64)) + msg = str(exc_info.value) + assert "sm=86" in msg + assert "sm=87" in msg + + +# ---------------------------------------------------------------------- +# AC-3: missing sidecar refused before manifest lookup. + + +def test_ac3_missing_sidecar_refused_before_manifest(tmp_path: Path) -> None: + # Arrange + engine = _write_engine(tmp_path, "ultravpr__sm87_jp6.2_trt10.3_fp16.engine") + entry = _entry_for(engine) + manifest_calls: list[Path] = [] + + class _TrackingManifest: + def read(self) -> DeploymentManifest: + manifest_calls.append(engine) + return _manifest_with(engine, "0" * 64) + + gate = EngineGate() + # Act / Assert + with pytest.raises(EngineSidecarMissingError, match=r"\.sha256"): + gate.validate(entry, _TIER2_HOST, _TrackingManifest()) + # Manifest may be read at step 0 (validate prologue) but the missing-sidecar + # check refuses BEFORE manifest lookup; no manifest *__getitem__* call. + + +# ---------------------------------------------------------------------- +# AC-4: sidecar trust failure. + + +def test_ac4_sidecar_hash_mismatches_file(tmp_path: Path) -> None: + # Arrange + engine = _write_engine(tmp_path, "ultravpr__sm87_jp6.2_trt10.3_fp16.engine") + _write_sidecar(engine, hex_digest="d" * 64) # wrong hash + entry = _entry_for(engine) + gate = EngineGate() + # Act / Assert + with pytest.raises(EngineHashMismatchError, match="sidecar"): + gate.validate(entry, _TIER2_HOST, _manifest_with(engine, "d" * 64)) + + +# ---------------------------------------------------------------------- +# AC-5: manifest mismatch. + + +def test_ac5_manifest_hash_mismatches_sidecar(tmp_path: Path) -> None: + # Arrange + engine = _write_engine(tmp_path, "ultravpr__sm87_jp6.2_trt10.3_fp16.engine") + true_hash = _write_sidecar(engine) + bogus_manifest = _manifest_with(engine, "f" * 64) + entry = _entry_for(engine) + gate = EngineGate() + # Act / Assert + with pytest.raises(EngineHashMismatchError, match="manifest hash"): + gate.validate(entry, _TIER2_HOST, bogus_manifest) + assert "f" * 64 in str(true_hash) or true_hash != "f" * 64 + + +# ---------------------------------------------------------------------- +# AC-6: full-success path returns silently and logs INFO. + + +def test_ac6_full_success_returns_silently_and_logs_pass( + tmp_path: Path, caplog: pytest.LogCaptureFixture +) -> None: + # Arrange + engine = _write_engine(tmp_path, "ultravpr__sm87_jp6.2_trt10.3_fp16.engine") + sha = _write_sidecar(engine) + manifest = _manifest_with(engine, sha) + entry = _entry_for(engine) + gate = EngineGate() + # Act + with caplog.at_level(logging.INFO, logger="c7_inference.gate"): + result = gate.validate(entry, _TIER2_HOST, manifest) + # Assert + assert result is None + pass_records = [ + r for r in caplog.records if getattr(r, "kind", None) == "c7.gate.pass" + ] + assert len(pass_records) == 1, ( + f"expected exactly one c7.gate.pass record; got {len(pass_records)} " + f"of {[r.message for r in caplog.records]}" + ) + + +# ---------------------------------------------------------------------- +# AC-7: refusal order is deterministic — schema wins over sidecar-missing. + + +def test_ac7_schema_error_wins_over_sidecar_missing(tmp_path: Path) -> None: + # Arrange — both broken: schema mismatch AND no sidecar on disk. + engine = _write_engine(tmp_path, "bogus.engine") + entry = _entry_for(engine) + gate = EngineGate() + # Act / Assert + with pytest.raises(EngineSchemaMismatchError): + gate.validate(entry, _TIER2_HOST, _manifest_with(engine, "0" * 64)) + + +# ---------------------------------------------------------------------- +# Manifest reader coverage. + + +def test_manifest_reader_round_trip(tmp_path: Path) -> None: + manifest_path = tmp_path / "manifest.json" + payload = { + "root": str(tmp_path), + "entries": { + "ultravpr__sm87_jp6.2_trt10.3_fp16.engine": "a" * 64, + "lightglue__sm87_jp6.2_trt10.3_int8.engine": "b" * 64, + }, + } + manifest_path.write_text(json.dumps(payload), encoding="utf-8") + manifest = ManifestReader(manifest_path).read() + assert manifest.root == tmp_path + assert manifest["ultravpr__sm87_jp6.2_trt10.3_fp16.engine"] == "a" * 64 + assert "lightglue__sm87_jp6.2_trt10.3_int8.engine" in manifest + + +def test_manifest_missing_entry_raises_hash_mismatch() -> None: + """Risk 2 — missing manifest entry MUST raise EngineHashMismatchError, NOT KeyError.""" + manifest = DeploymentManifest(root=Path("/var/lib/x"), entries={}) + with pytest.raises(EngineHashMismatchError, match="missing manifest entry"): + _ = manifest["unknown.engine"] + + +def test_manifest_reader_rejects_malformed_json(tmp_path: Path) -> None: + bad = tmp_path / "manifest.json" + bad.write_text("not json {", encoding="utf-8") + with pytest.raises(EngineHashMismatchError, match="not valid JSON"): + ManifestReader(bad).read() + + +def test_manifest_reader_rejects_missing_entries_key(tmp_path: Path) -> None: + bad = tmp_path / "manifest.json" + bad.write_text(json.dumps({"root": "/x"}), encoding="utf-8") + with pytest.raises(EngineHashMismatchError, match="entries"): + ManifestReader(bad).read() + + +# ---------------------------------------------------------------------- +# Engine-outside-manifest-root path. + + +def test_engine_outside_manifest_root_refused(tmp_path: Path) -> None: + # Arrange — engine lives under tmp_path but manifest root is a sibling dir. + engine = _write_engine(tmp_path, "ultravpr__sm87_jp6.2_trt10.3_fp16.engine") + sha = _write_sidecar(engine) + sibling = tmp_path / "elsewhere" + sibling.mkdir() + manifest = DeploymentManifest( + root=sibling, + entries={"ultravpr__sm87_jp6.2_trt10.3_fp16.engine": sha}, + ) + gate = EngineGate() + with pytest.raises(EngineHashMismatchError, match="not under manifest root"): + gate.validate(_entry_for(engine), _TIER2_HOST, manifest) + + +# ---------------------------------------------------------------------- +# NFR-reliability — no writes by the gate. + + +def test_nfr_reliability_no_writes(tmp_path: Path) -> None: + """The gate is read-only — it must not touch the engine, sidecar, or manifest.""" + engine = _write_engine(tmp_path, "ultravpr__sm87_jp6.2_trt10.3_fp16.engine") + sha = _write_sidecar(engine) + sidecar = Path(str(engine) + ".sha256") + snapshot = { + "engine_mtime": engine.stat().st_mtime, + "engine_bytes": engine.read_bytes(), + "sidecar_mtime": sidecar.stat().st_mtime, + "sidecar_text": sidecar.read_text(encoding="utf-8"), + } + manifest = _manifest_with(engine, sha) + EngineGate().validate(_entry_for(engine), _TIER2_HOST, manifest) + assert engine.stat().st_mtime == snapshot["engine_mtime"] + assert engine.read_bytes() == snapshot["engine_bytes"] + assert sidecar.stat().st_mtime == snapshot["sidecar_mtime"] + assert sidecar.read_text(encoding="utf-8") == snapshot["sidecar_text"] + + +# ---------------------------------------------------------------------- +# Refusal log on error path. + + +def test_refusal_emits_error_log( + tmp_path: Path, caplog: pytest.LogCaptureFixture +) -> None: + engine = _write_engine(tmp_path, "bogus.engine") + gate = EngineGate() + with caplog.at_level(logging.ERROR, logger="c7_inference.gate"): + with pytest.raises(EngineSchemaMismatchError): + gate.validate(_entry_for(engine), _TIER2_HOST, _manifest_with(engine, "0" * 64)) + refusals = [ + r for r in caplog.records if getattr(r, "kind", None) == "c7.gate.refuse" + ] + assert len(refusals) == 1 + assert refusals[0].kv["step"] == "schema_parse" # type: ignore[attr-defined] + + +# ---------------------------------------------------------------------- +# AC-8 host-tuple read (Tier-2 hardware only). + + +@pytest.mark.tier2 +@pytest.mark.skipif( + os.environ.get("GPS_DENIED_TIER") != "2", + reason="read_host_tuple needs real Jetson hardware + NVML + L4T release file", +) +def test_ac8_read_host_tuple_on_jetson() -> None: + from gps_denied_onboard.components.c7_inference.engine_gate import read_host_tuple + + host = read_host_tuple(precision=PrecisionMode.FP16) + assert host.sm == 87 + assert host.jp == "6.2" + assert host.trt == "10.3" + assert host.precision is PrecisionMode.FP16