gps-denied-onboard/src/gps_denied_onboard/components/c10_provisioning/engine_compiler.py

"""C10 ``EngineCompiler`` — per-model TRT compile + hardware-tied cache reuse (AZ-321).

Public surface frozen by `_docs/02_document/components/11_c10_provisioning/description.md`
§5 (error handling) + §7 (D-C10-6 calibration-cache reuse, D-C10-7 self-describing
filename).

Responsibilities
----------------

For every :class:`BackboneSpec` in :class:`EngineCompileRequest` the
compiler:

1.  Computes the canonical engine filename via AZ-281
    :class:`EngineFilenameSchema` from the host's
    :class:`HostCapabilities` plus the request precision.
2.  If the engine is already on disk at
    ``{cache_root}/{filename}`` AND
    :meth:`Sha256Sidecar.verify` returns ``True`` for that path:
    treats it as a cache hit (``CompileOutcome.REUSED``) and returns a
    canonical :class:`EngineCacheEntry` synthesised from the sidecar.
    Zero calls to the injected :class:`InferenceRuntime`.
3.  Otherwise delegates to
    :meth:`InferenceRuntime.compile_engine` (AZ-298 / AZ-299 / AZ-300
    own the write path; the runtime atomically writes both the
    ``.engine`` binary and its ``.sha256`` sidecar). The compiler does
    NOT double-write the file — the task spec's "engine bytes are
    returned by compile_engine then written via the sidecar" wording
    contradicts the actual AZ-297 Protocol (``compile_engine`` returns
    an :class:`EngineCacheEntry`, not raw bytes); the Protocol shipped
    first and wins.

Hardware-tied cache reuse (D-C10-6) is satisfied by the filename
construction: an engine compiled on ``(sm=87, jp=6.2, trt=10.3, fp16)``
lives at a different path than one compiled on
``(sm=89, jp=6.3, trt=10.5, fp16)`` so a hardware change naturally
forces a rebuild — the compiler neither loads nor deletes stale
engines (AC-4).
"""

from __future__ import annotations

import logging
import time
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Protocol, runtime_checkable

from gps_denied_onboard._types.inference import (
    BuildConfig,
    EngineCacheEntry,
    OptimizationProfile,
    PrecisionMode,
)
from gps_denied_onboard._types.inference_errors import (
    CalibrationCacheError,
    EngineBuildError,
)
from gps_denied_onboard._types.manifests import HostCapabilities
from gps_denied_onboard.helpers.engine_filename_schema import (
    EngineFilenameSchema,
)
from gps_denied_onboard.helpers.sha256_sidecar import (
    Sha256Sidecar,
    Sha256SidecarError,
)

# Public names of this module; this list is what ``from ... import *``
# exports. Underscore-prefixed helpers below are intentionally absent.
__all__ = [
    "BackboneSpec",
    "CompileEngineCallable",
    "CompileOutcome",
    "EngineCompileRequest",
    "EngineCompileResult",
    "EngineCompileSummary",
    "EngineCompiler",
]


# Default for ``EngineCompileRequest.workspace_mb``, which
# ``_build_config_for_backbone`` forwards to ``BuildConfig.workspace_mb``.
# NOTE(review): the name implies megabytes — confirm the unit against
# the runtime that consumes ``BuildConfig``.
_DEFAULT_WORKSPACE_MB: int = 4096


@runtime_checkable
class CompileEngineCallable(Protocol):
    """Minimal structural view of the C7 ``InferenceRuntime`` Protocol (AZ-297).

    :meth:`InferenceRuntime.compile_engine` is the only runtime method
    the compiler ever invokes, so any object exposing a compatible
    ``compile_engine`` is accepted. Declaring the cut locally keeps c10
    free of cross-component imports (architecture rule
    ``test_az270_compose_root.test_ac6``); the real
    :class:`gps_denied_onboard.components.c7_inference.InferenceRuntime`
    still plugs in unchanged through duck typing, wired by the
    composition root. The LightGlue ``EngineHandle`` consumer cut in
    ``_types/manifests.py`` follows the same dual-Protocol pattern.
    """

    def compile_engine(
        self,
        model_path: Path,
        build_config: BuildConfig,
    ) -> EngineCacheEntry:
        """Compile ``model_path`` under ``build_config``; return its cache entry."""
        ...


class CompileOutcome(str, Enum):
    """How one backbone was resolved by ``compile_engines_for_corpus``.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # The engine was produced by a fresh ``compile_engine`` call.
    BUILT = "built"
    # A verified on-disk engine was picked up instead of compiling.
    REUSED = "reused"


@dataclass(frozen=True)
class BackboneSpec:
    """Description of a single model the corpus needs an engine for.

    ``expected_input_shape`` feeds one synthesised
    :class:`OptimizationProfile` whose ``min``, ``opt`` and ``max``
    shapes are all equal. Backbones that need explicit dynamic ranges
    are out of scope for the AZ-321 corpus: a ``custom_profiles``
    extension is reserved for later and is NOT a field of this class
    today.
    """

    # Model identifier; becomes part of the canonical engine filename.
    model_name: str
    # Location of the exported ONNX graph handed to the runtime.
    onnx_path: Path
    # Static input shape used for min/opt/max of the single profile.
    expected_input_shape: tuple[int, ...]
    # Most exported ONNX graphs in this project name their input
    # "input"; backbones that differ must override this.
    input_name: str = "input"


@dataclass(frozen=True)
class EngineCompileRequest:
    """Everything one ``compile_engines_for_corpus`` call needs.

    The AZ-297 :class:`InferenceRuntime` Protocol exposes no host-info,
    so ``host`` is supplied by the caller instead of being introspected
    through the runtime: the composition root resolves
    :class:`HostCapabilities` from device probes (Tier-2) or test
    fixtures (Tier-1) and threads it through here. The compiler thus
    stays decoupled from the runtime's introspection surface, and the
    AC-4 (hardware change) test becomes trivial.
    """

    # Models to resolve, processed in order.
    backbones: tuple[BackboneSpec, ...]
    # Forwarded as ``BuildConfig.calibration_dataset``; may be ``None``.
    calibration_path: Path | None
    # Directory that holds the engine files (created if missing).
    cache_root: Path
    # Precision used both in the engine filename and the build config.
    precision: PrecisionMode
    # Host description whose sm / jetpack / trt go into the filename.
    host: HostCapabilities
    # Forwarded as ``BuildConfig.workspace_mb``.
    workspace_mb: int = _DEFAULT_WORKSPACE_MB


@dataclass(frozen=True)
class EngineCompileResult:
    """Outcome record for one backbone after ``compile_engines_for_corpus``.

    ``entry`` holds the canonical
    :class:`gps_denied_onboard._types.inference.EngineCacheEntry`; its
    shape is the same for freshly built and reused engines.
    ``outcome`` and ``compile_duration_s`` are c10-local bookkeeping
    wrapped around it. (The AZ-321 task spec named this combined record
    ``EngineCacheEntry``, but the AZ-297 canonical DTO already owns
    that name — the canonical shape wins and this wrapper is named
    separately.)
    """

    # Canonical cache entry describing the engine on disk.
    entry: EngineCacheEntry
    # Whether the engine was built now or reused from the cache.
    outcome: CompileOutcome
    # Wall time of the compile call in seconds; ``None`` for reused engines.
    compile_duration_s: float | None


@dataclass(frozen=True)
class EngineCompileSummary:
    """Roll-up counters reported by the ``c10.engine.compile.summary`` log."""

    # Number of results whose outcome was BUILT.
    engines_built: int
    # Number of results whose outcome was REUSED.
    engines_reused: int
    # Reused results divided by all results; 0.0 when there are none.
    cache_hit_ratio: float


class EngineCompiler:
    """Compile or reuse TensorRT engines for every backbone in a corpus.

    The compiler is stateless across calls; ``__init__`` only injects
    the collaborators it cannot construct itself
    (the :class:`InferenceRuntime` is composition-root-owned; the
    logger is named per component).

    Every log record emitted here uses the event name as the message
    and carries ``extra={"kind": <event name>, "kv": {...}}``.
    """

    def __init__(
        self,
        *,
        inference_runtime: CompileEngineCallable,
        logger: logging.Logger,
    ) -> None:
        """Store the injected runtime and logger; performs no I/O."""
        self._runtime = inference_runtime
        self._log = logger

    def compile_engines_for_corpus(
        self, request: EngineCompileRequest
    ) -> tuple[EngineCompileResult, ...]:
        """Compile or reuse one engine per backbone in ``request.backbones``.

        ``request.cache_root`` is created first (including parents).
        Results are returned in the order of ``request.backbones``.

        Empty ``backbones`` → empty result and a summary log with
        all-zero counts (AC-10). Errors from
        :meth:`InferenceRuntime.compile_engine` are never swallowed:
        the documented typed errors are logged by ``_compile_one`` and
        re-raised, everything else propagates untouched (AC-6 / AC-7).
        When an error propagates, the summary log is not emitted. Side
        effects on backbones compiled before the failing one are
        visible on disk; the compiler does NOT roll back (AZ-298's
        atomic-write guarantees no half-engine).
        """

        request.cache_root.mkdir(parents=True, exist_ok=True)
        results = [
            self._compile_one(backbone, request)
            for backbone in request.backbones
        ]

        summary = _summarise(results)
        self._log.info(
            "c10.engine.compile.summary",
            extra={
                "kind": "c10.engine.compile.summary",
                "kv": {
                    "engines_built": summary.engines_built,
                    "engines_reused": summary.engines_reused,
                    "cache_hit_ratio": summary.cache_hit_ratio,
                    "total": len(results),
                },
            },
        )
        return tuple(results)

    def _compile_one(
        self,
        backbone: BackboneSpec,
        request: EngineCompileRequest,
    ) -> EngineCompileResult:
        """Resolve a single backbone: reuse a verified engine or compile one.

        The target path is ``request.cache_root`` joined with the
        filename produced by :class:`EngineFilenameSchema` from the
        model name, the host's sm / jetpack / trt and the precision
        value. A cache hit returns ``REUSED`` with no duration; a miss
        calls the runtime and returns ``BUILT`` with the elapsed wall
        time of that call.

        Raises:
            EngineBuildError, CalibrationCacheError: re-raised after a
                ``c10.engine.compile.error`` log. Any other exception
                from the runtime propagates without being logged here.
        """

        filename = EngineFilenameSchema.build(
            model_name=backbone.model_name,
            sm=request.host.sm,
            jetpack=request.host.jetpack,
            trt=request.host.trt,
            precision=request.precision.value,
        )
        target_path = request.cache_root / filename

        cache_hit_entry = self._maybe_reuse(target_path, backbone, request)
        if cache_hit_entry is not None:
            self._log.info(
                "c10.engine.cache.hit",
                extra={
                    "kind": "c10.engine.cache.hit",
                    "kv": {
                        "model_name": backbone.model_name,
                        "engine_path": str(target_path),
                    },
                },
            )
            return EngineCompileResult(
                entry=cache_hit_entry,
                outcome=CompileOutcome.REUSED,
                compile_duration_s=None,
            )

        self._log.warning(
            "c10.engine.cache.miss",
            extra={
                "kind": "c10.engine.cache.miss",
                "kv": {
                    "model_name": backbone.model_name,
                    "target_filename": filename,
                },
            },
        )
        build_config = _build_config_for_backbone(backbone, request)
        t0 = time.perf_counter()
        try:
            entry = self._runtime.compile_engine(
                backbone.onnx_path, build_config
            )
        except (EngineBuildError, CalibrationCacheError) as exc:
            # AZ-507 narrowed the catch to the documented C7 typed-error
            # envelope (`_types/inference_errors.py` re-exports
            # `EngineBuildError` + `CalibrationCacheError` from
            # `c7_inference.errors` without violating the AZ-270 lint).
            # Unknown exceptions intentionally propagate unhandled — they
            # are programmer errors, not C7 contract failures, and must
            # not be swallowed under a structured "compile.error" log.
            self._log.error(
                "c10.engine.compile.error",
                extra={
                    "kind": "c10.engine.compile.error",
                    "kv": {
                        "model_name": backbone.model_name,
                        "calibration_path": (
                            str(request.calibration_path)
                            if request.calibration_path is not None
                            else None
                        ),
                        "error_class": type(exc).__name__,
                        "message": str(exc),
                    },
                },
            )
            raise
        elapsed_s = time.perf_counter() - t0
        return EngineCompileResult(
            entry=entry,
            outcome=CompileOutcome.BUILT,
            compile_duration_s=elapsed_s,
        )

    def _maybe_reuse(
        self,
        target_path: Path,
        backbone: BackboneSpec,
        request: EngineCompileRequest,
    ) -> EngineCacheEntry | None:
        """Return a synthesised :class:`EngineCacheEntry` on cache hit; ``None`` on miss.

        A miss is any of: ``target_path`` does not exist (silent),
        :meth:`Sha256Sidecar.verify` raises
        :class:`Sha256SidecarError`, or it returns a falsy value. The
        last two emit one ``c10.engine.sidecar.mismatch`` WARN log whose
        ``reason`` is the exception text or ``"digest_mismatch"``. The
        recompile-on-miss branch is owned by the caller.
        """

        if not target_path.exists():
            return None

        # Fold both failure modes into a single ``reason`` so the
        # mismatch warning has exactly one emit site and its payload
        # cannot drift between the two branches.
        try:
            verified = Sha256Sidecar.verify(target_path)
        except Sha256SidecarError as exc:
            mismatch_reason = str(exc)
        else:
            mismatch_reason = None if verified else "digest_mismatch"

        if mismatch_reason is not None:
            self._log.warning(
                "c10.engine.sidecar.mismatch",
                extra={
                    "kind": "c10.engine.sidecar.mismatch",
                    "kv": {
                        "model_name": backbone.model_name,
                        "engine_path": str(target_path),
                        "reason": mismatch_reason,
                    },
                },
            )
            return None

        # The sidecar lives next to the engine as "<engine path>.sha256".
        # Explicit encoding keeps the read independent of the process
        # locale.
        # NOTE(review): assumes the sidecar body is the bare hex digest —
        # confirm against the Sha256Sidecar writer.
        sidecar_path = Path(str(target_path) + ".sha256")
        sidecar_text = sidecar_path.read_text(encoding="utf-8").strip()
        return EngineCacheEntry(
            engine_path=target_path,
            sha256_hex=sidecar_text,
            sm=request.host.sm,
            jp=request.host.jetpack,
            trt=request.host.trt,
            precision=request.precision,
            extras={},
        )


def _build_config_for_backbone(
    backbone: BackboneSpec, request: EngineCompileRequest
) -> BuildConfig:
    """Derive the :class:`BuildConfig` used to compile ``backbone``.

    The config carries a single :class:`OptimizationProfile` pinned to
    ``backbone.expected_input_shape`` (``min``, ``opt`` and ``max`` are
    all that same shape). Precision, workspace size and calibration
    path are taken straight from ``request``. Dynamic input ranges are
    out of scope for AZ-321 and would need a richer ``BackboneSpec``
    variant.
    """

    fixed_shape = backbone.expected_input_shape
    static_profile = OptimizationProfile(
        input_name=backbone.input_name,
        min_shape=fixed_shape,
        opt_shape=fixed_shape,
        max_shape=fixed_shape,
    )
    profiles = (static_profile,)
    return BuildConfig(
        precision=request.precision,
        workspace_mb=request.workspace_mb,
        calibration_dataset=request.calibration_path,
        optimization_profiles=profiles,
    )


def _summarise(
    results: list[EngineCompileResult],
) -> EngineCompileSummary:
    """Tally built vs. reused outcomes and compute the cache-hit ratio.

    The ratio is reused results over all results, or ``0.0`` when
    ``results`` is empty.
    """
    built_count = 0
    reused_count = 0
    for record in results:
        # Identity comparison against the enum members, as outcomes are
        # expected to be ``CompileOutcome`` instances.
        if record.outcome is CompileOutcome.BUILT:
            built_count += 1
        elif record.outcome is CompileOutcome.REUSED:
            reused_count += 1

    if results:
        hit_ratio = reused_count / len(results)
    else:
        hit_ratio = 0.0

    return EngineCompileSummary(
        engines_built=built_count,
        engines_reused=reused_count,
        cache_hit_ratio=hit_ratio,
    )