mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 08:21:13 +00:00
e2bebefdfc
AZ-507: codify cross-component import rule. Added _types/inference_errors.py shim re-exporting EngineBuildError + CalibrationCacheError from c7_inference; narrowed C10 EngineCompiler's except Exception to the two typed errors so unknown exceptions propagate (AC-3). Rewrote module-layout.md "Imports from" sections for 9 components + added Rule 9; appended an architecture.md ADR-009 note explaining why components must go through _types/*. AZ-323: ManifestBuilder + Ed25519ManifestSigner. Canonical JSON via orjson OPT_SORT_KEYS+OPT_INDENT_2, atomic-write Manifest.json + sha sidecar + .sig via AZ-280, operator-key fingerprint allowlist gate (C10-ST-01), ADR-010 takeoff_origin + flight_id baked into Manifest AND manifest_hash so re-planned routes change the cache identity (AC-15/AC-16). 20 unit tests cover all 16 ACs. AZ-324: ManifestVerifierImpl. Fail-closed Steps A-D: Manifest.json sidecar self-hash, Ed25519 trust-key set, schema parse with absolute/.. path rejection + takeoff_origin in-bbox check, stream SHA-256 per artifact with multi-failure accumulation. Operator mode re-derives tiles_coverage_sha256 from C6; airborne mode trusts the signed aggregate. 19 unit tests cover all 17 ACs. Composition root: c10_factory.build_manifest_builder + build_manifest_verifier + c6_tile_metadata_store_to_tiles_query adapter (the one place that legitimately imports both C6 and C10 without violating the AZ-270 lint). Dependency: pinned cryptography>=43.0,<46.0 in pyproject.toml. Tests: 1300 passed, 80 skipped (env-only), ruff clean for all AZ-323/324 files. AZ-306 (FAISS) intentionally deferred to batch 35 — needs C++ pybind11 toolchain not present in this environment. Co-authored-by: Cursor <cursoragent@cursor.com>
412 lines
14 KiB
Python
412 lines
14 KiB
Python
"""C10 ``EngineCompiler`` — per-model TRT compile + hardware-tied cache reuse (AZ-321).

Public surface frozen by `_docs/02_document/components/11_c10_provisioning/description.md`
§5 (error handling) + §7 (D-C10-6 calibration-cache reuse, D-C10-7 self-describing
filename).

Responsibilities
----------------

For every :class:`BackboneSpec` in :class:`EngineCompileRequest` the
compiler:

1. Computes the canonical engine filename via AZ-281
   :class:`EngineFilenameSchema` from the host's
   :class:`HostCapabilities` plus the request precision.
2. If the engine is already on disk at
   ``{cache_root}/{filename}`` AND
   :meth:`Sha256Sidecar.verify` returns ``True`` for that path:
   treats it as a cache hit (``CompileOutcome.REUSED``) and returns a
   canonical :class:`EngineCacheEntry` synthesised from the sidecar.
   Zero calls to the injected :class:`InferenceRuntime`.
3. Otherwise delegates to
   :meth:`InferenceRuntime.compile_engine` (AZ-298 / AZ-299 / AZ-300
   own the write path; the runtime atomically writes both the
   ``.engine`` binary and its ``.sha256`` sidecar). The compiler does
   NOT double-write the file — the task spec's "engine bytes are
   returned by compile_engine then written via the sidecar" wording
   contradicts the actual AZ-297 Protocol (``compile_engine`` returns
   an :class:`EngineCacheEntry`, not raw bytes); the Protocol shipped
   first and wins.

Hardware-tied cache reuse (D-C10-6) is satisfied by the filename
construction: an engine compiled on ``(sm=87, jp=6.2, trt=10.3, fp16)``
lives at a different path than one compiled on
``(sm=89, jp=6.3, trt=10.5, fp16)`` so a hardware change naturally
forces a rebuild — the compiler neither loads nor deletes stale
engines (AC-4).
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Protocol, runtime_checkable
|
|
|
|
from gps_denied_onboard._types.inference import (
|
|
BuildConfig,
|
|
EngineCacheEntry,
|
|
OptimizationProfile,
|
|
PrecisionMode,
|
|
)
|
|
from gps_denied_onboard._types.inference_errors import (
|
|
CalibrationCacheError,
|
|
EngineBuildError,
|
|
)
|
|
from gps_denied_onboard._types.manifests import HostCapabilities
|
|
from gps_denied_onboard.helpers.engine_filename_schema import (
|
|
EngineFilenameSchema,
|
|
)
|
|
from gps_denied_onboard.helpers.sha256_sidecar import (
|
|
Sha256Sidecar,
|
|
Sha256SidecarError,
|
|
)
|
|
|
|
# Names that make up this module's public surface.
__all__ = [
    "BackboneSpec",
    "CompileEngineCallable",
    "CompileOutcome",
    "EngineCompileRequest",
    "EngineCompileResult",
    "EngineCompileSummary",
    "EngineCompiler",
]


# Default for ``EngineCompileRequest.workspace_mb``; the value is forwarded
# unchanged into ``BuildConfig.workspace_mb`` when a request does not
# override it.
_DEFAULT_WORKSPACE_MB: int = 4096
|
|
|
|
|
|
@runtime_checkable
class CompileEngineCallable(Protocol):
    """Structural cut of the C7 ``InferenceRuntime`` Protocol (AZ-297).

    The compiler only ever calls
    :meth:`InferenceRuntime.compile_engine`, so it accepts any object
    that structurally satisfies this narrow Protocol. This keeps the
    c10 component free of cross-component imports (architecture rule
    ``test_az270_compose_root.test_ac6``) while still letting the real
    :class:`gps_denied_onboard.components.c7_inference.InferenceRuntime`
    plug in unchanged via duck typing — the composition root wires the
    concrete strategy in. Same dual-Protocol pattern used by the
    LightGlue ``EngineHandle`` consumer cut in ``_types/manifests.py``.

    Because the Protocol is ``runtime_checkable``, ``isinstance`` checks
    only verify that a ``compile_engine`` attribute is present; the
    signature itself is enforced by static type checkers, not at runtime.
    """

    def compile_engine(
        self, model_path: Path, build_config: BuildConfig
    ) -> EngineCacheEntry: ...
|
|
|
|
|
|
class CompileOutcome(str, Enum):
    """Per-backbone outcome of one ``compile_engines_for_corpus`` call.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # The engine was produced by a fresh ``compile_engine`` call.
    BUILT = "built"
    # An existing on-disk engine with a valid sidecar was reused.
    REUSED = "reused"
|
|
|
|
|
|
@dataclass(frozen=True)
class BackboneSpec:
    """One model the corpus needs an engine for.

    ``input_name`` defaults to ``"input"`` because most exported ONNX
    graphs in this project use that name; backbones with a different
    input name must override it. ``expected_input_shape`` is used to
    synthesise a single :class:`OptimizationProfile` with
    ``min == opt == max``.

    Backbones that need explicit dynamic ranges are out of scope for the
    AZ-321 corpus. A ``custom_profiles`` extension is reserved for that
    case, but this dataclass does not define such a field yet.
    """

    # Model identifier; used as the ``model_name`` part of the engine filename.
    model_name: str
    # Path to the ONNX graph handed to the runtime's ``compile_engine``.
    onnx_path: Path
    # Static input shape used for min/opt/max of the optimization profile.
    expected_input_shape: tuple[int, ...]
    # Name of the graph input the optimization profile is bound to.
    input_name: str = "input"
|
|
|
|
|
|
@dataclass(frozen=True)
class EngineCompileRequest:
    """Inputs to one ``compile_engines_for_corpus`` invocation.

    ``host`` is passed in (rather than introspected via the runtime)
    because the AZ-297 :class:`InferenceRuntime` Protocol does not
    expose host-info; the composition root resolves
    :class:`HostCapabilities` from device probes (Tier-2) or test
    fixtures (Tier-1) and threads it through here. This keeps the
    compiler decoupled from the runtime's introspection surface and
    makes the AC-4 (hardware change) test trivial.
    """

    # Models to compile or reuse, processed in order.
    backbones: tuple[BackboneSpec, ...]
    # Forwarded as ``BuildConfig.calibration_dataset``; ``None`` when absent.
    calibration_path: Path | None
    # Directory holding the engines; created on demand by the compiler.
    cache_root: Path
    # Precision mode; its ``.value`` is embedded in the engine filename.
    precision: PrecisionMode
    # Host description supplying the ``sm`` / ``jetpack`` / ``trt`` filename parts.
    host: HostCapabilities
    # Forwarded as ``BuildConfig.workspace_mb``.
    workspace_mb: int = _DEFAULT_WORKSPACE_MB
|
|
|
|
|
|
@dataclass(frozen=True)
class EngineCompileResult:
    """One backbone's outcome record after ``compile_engines_for_corpus``.

    ``entry`` is the canonical
    :class:`gps_denied_onboard._types.inference.EngineCacheEntry` —
    same shape whether the engine was freshly built or reused. The
    surrounding ``outcome`` + ``compile_duration_s`` are c10-local
    bookkeeping (the AZ-321 task spec called this combined record
    ``EngineCacheEntry`` but that name is already taken by the AZ-297
    canonical DTO; the canonical shape wins and the wrapper takes a
    new name).
    """

    # Canonical cache entry describing the engine on disk.
    entry: EngineCacheEntry
    # Whether the engine was built by this call or reused from the cache.
    outcome: CompileOutcome
    # Wall time of the ``compile_engine`` call in seconds; ``None`` on reuse.
    compile_duration_s: float | None
|
|
|
|
|
|
@dataclass(frozen=True)
class EngineCompileSummary:
    """Aggregate counts surfaced via the ``c10.engine.compile.summary`` log."""

    # Number of results whose outcome is ``CompileOutcome.BUILT``.
    engines_built: int
    # Number of results whose outcome is ``CompileOutcome.REUSED``.
    engines_reused: int
    # ``engines_reused / total``; ``0.0`` when there are no results.
    cache_hit_ratio: float
|
|
|
|
|
|
class EngineCompiler:
    """Compile or reuse TensorRT engines for every backbone in a corpus.

    The compiler is stateless across calls; ``__init__`` only injects
    the collaborators it cannot construct itself
    (the :class:`InferenceRuntime` is composition-root-owned; the
    logger is named per component).
    """

    def __init__(
        self,
        *,
        inference_runtime: CompileEngineCallable,
        logger: logging.Logger,
    ) -> None:
        """Store the injected runtime and logger; performs no I/O."""
        self._runtime = inference_runtime
        self._log = logger

    def compile_engines_for_corpus(
        self, request: EngineCompileRequest
    ) -> tuple[EngineCompileResult, ...]:
        """Compile or reuse one engine per backbone in ``request.backbones``.

        Empty ``backbones`` → empty result and a summary log with
        all-zero counts (AC-10). Errors from
        :meth:`InferenceRuntime.compile_engine` are NOT caught here —
        they propagate to the caller (AC-6 / AC-7), in which case the
        summary log is not emitted. Side effects on backbones compiled
        before the failing one are visible on disk; the compiler does
        NOT roll back (AZ-298's atomic-write guarantees no half-engine).

        Returns:
            One :class:`EngineCompileResult` per backbone, in request order.
        """
        request.cache_root.mkdir(parents=True, exist_ok=True)
        results: list[EngineCompileResult] = [
            self._compile_one(backbone, request)
            for backbone in request.backbones
        ]

        summary = _summarise(results)
        self._log.info(
            "c10.engine.compile.summary",
            extra={
                "kind": "c10.engine.compile.summary",
                "kv": {
                    "engines_built": summary.engines_built,
                    "engines_reused": summary.engines_reused,
                    "cache_hit_ratio": summary.cache_hit_ratio,
                    "total": len(results),
                },
            },
        )
        return tuple(results)

    def _compile_one(
        self,
        backbone: BackboneSpec,
        request: EngineCompileRequest,
    ) -> EngineCompileResult:
        """Reuse the cached engine for ``backbone`` or build a fresh one.

        Raises:
            EngineBuildError, CalibrationCacheError: logged as
                ``c10.engine.compile.error`` and re-raised unchanged.
            Any other exception from the runtime propagates without
            being logged here.
        """
        filename = EngineFilenameSchema.build(
            model_name=backbone.model_name,
            sm=request.host.sm,
            jetpack=request.host.jetpack,
            trt=request.host.trt,
            precision=request.precision.value,
        )
        target_path = request.cache_root / filename

        cache_hit_entry = self._maybe_reuse(target_path, backbone, request)
        if cache_hit_entry is not None:
            self._log.info(
                "c10.engine.cache.hit",
                extra={
                    "kind": "c10.engine.cache.hit",
                    "kv": {
                        "model_name": backbone.model_name,
                        "engine_path": str(target_path),
                    },
                },
            )
            return EngineCompileResult(
                entry=cache_hit_entry,
                outcome=CompileOutcome.REUSED,
                compile_duration_s=None,
            )

        self._log.warning(
            "c10.engine.cache.miss",
            extra={
                "kind": "c10.engine.cache.miss",
                "kv": {
                    "model_name": backbone.model_name,
                    "target_filename": filename,
                },
            },
        )
        build_config = _build_config_for_backbone(backbone, request)
        t0 = time.perf_counter()
        try:
            entry = self._runtime.compile_engine(
                backbone.onnx_path, build_config
            )
        except (EngineBuildError, CalibrationCacheError) as exc:
            # AZ-507 narrowed the catch to the documented C7 typed-error
            # envelope (`_types/inference_errors.py` re-exports
            # `EngineBuildError` + `CalibrationCacheError` from
            # `c7_inference.errors` without violating the AZ-270 lint).
            # Unknown exceptions intentionally propagate unhandled — they
            # are programmer errors, not C7 contract failures, and must
            # not be swallowed under a structured "compile.error" log.
            self._log.error(
                "c10.engine.compile.error",
                extra={
                    "kind": "c10.engine.compile.error",
                    "kv": {
                        "model_name": backbone.model_name,
                        "calibration_path": (
                            str(request.calibration_path)
                            if request.calibration_path is not None
                            else None
                        ),
                        "error_class": type(exc).__name__,
                        "message": str(exc),
                    },
                },
            )
            raise
        elapsed_s = time.perf_counter() - t0
        return EngineCompileResult(
            entry=entry,
            outcome=CompileOutcome.BUILT,
            compile_duration_s=elapsed_s,
        )

    def _maybe_reuse(
        self,
        target_path: Path,
        backbone: BackboneSpec,
        request: EngineCompileRequest,
    ) -> EngineCacheEntry | None:
        """Return a synthesised :class:`EngineCacheEntry` on cache hit; ``None`` on miss.

        Side effect: emits a WARN log on a tampered / missing sidecar
        (the engine file exists but its sidecar is invalid). The
        recompile-on-miss branch is owned by the caller.
        """
        if not target_path.exists():
            return None
        try:
            verified = Sha256Sidecar.verify(target_path)
        except Sha256SidecarError as exc:
            self._warn_sidecar_mismatch(
                target_path, backbone, reason=str(exc)
            )
            return None
        if not verified:
            self._warn_sidecar_mismatch(
                target_path, backbone, reason="digest_mismatch"
            )
            return None

        # The sidecar sits next to the engine with ".sha256" appended to the
        # full filename. Read it with an explicit encoding so the result does
        # not depend on the host locale.
        sidecar_path = Path(str(target_path) + ".sha256")
        sidecar_text = sidecar_path.read_text(encoding="utf-8").strip()
        return EngineCacheEntry(
            engine_path=target_path,
            sha256_hex=sidecar_text,
            sm=request.host.sm,
            jp=request.host.jetpack,
            trt=request.host.trt,
            precision=request.precision,
            extras={},
        )

    def _warn_sidecar_mismatch(
        self,
        target_path: Path,
        backbone: BackboneSpec,
        *,
        reason: str,
    ) -> None:
        """Emit the structured ``c10.engine.sidecar.mismatch`` warning."""
        self._log.warning(
            "c10.engine.sidecar.mismatch",
            extra={
                "kind": "c10.engine.sidecar.mismatch",
                "kv": {
                    "model_name": backbone.model_name,
                    "engine_path": str(target_path),
                    "reason": reason,
                },
            },
        )
|
|
|
|
|
|
def _build_config_for_backbone(
    backbone: BackboneSpec, request: EngineCompileRequest
) -> BuildConfig:
    """Synthesise a :class:`BuildConfig` from a :class:`BackboneSpec`.

    Constructs exactly one :class:`OptimizationProfile` with
    ``min == opt == max == expected_input_shape``; backbones with
    dynamic input ranges are out of scope for AZ-321 and would need
    a richer ``BackboneSpec`` variant.

    Precision, workspace size and calibration path are taken from
    ``request`` unchanged.
    """
    # One static shape serves as min, opt and max of the single profile.
    static_shape = backbone.expected_input_shape
    profile = OptimizationProfile(
        input_name=backbone.input_name,
        min_shape=static_shape,
        opt_shape=static_shape,
        max_shape=static_shape,
    )
    return BuildConfig(
        precision=request.precision,
        workspace_mb=request.workspace_mb,
        calibration_dataset=request.calibration_path,
        optimization_profiles=(profile,),
    )
|
|
|
|
|
|
def _summarise(
    results: list[EngineCompileResult],
) -> EngineCompileSummary:
    """Fold per-backbone results into an :class:`EngineCompileSummary`.

    ``cache_hit_ratio`` is ``reused / len(results)``; it is ``0.0`` for an
    empty result list so the division never sees a zero denominator.
    """
    built = sum(1 for r in results if r.outcome is CompileOutcome.BUILT)
    reused = sum(1 for r in results if r.outcome is CompileOutcome.REUSED)
    total = len(results)
    ratio = reused / total if total > 0 else 0.0
    return EngineCompileSummary(
        engines_built=built,
        engines_reused=reused,
        cache_hit_ratio=ratio,
    )
|