[AZ-322] C10 DescriptorBatcher (faiss-cpu, OOM halve-retry)

Implements the C10 internal phase that walks every C6 tile, embeds through C2's backbone via the AZ-321-produced engine, and rebuilds the AZ-306 FAISS HNSW index in one atomic write. - DescriptorBatcher with halve-and-retry OOM recovery (default 1 retry) - BackboneEmbedder Protocol + C7EngineBackboneEmbedder default impl - DescriptorBatchError for OOM / dim-mismatch / missing-output failures - Empty-corpus surfaces as outcome=failure with explicit hint to run C11 - Per-10% progress callback + DEBUG logs (no engine bytes leaked) - Consumer-side Protocol cuts (TilesByBboxBatchQuery, TilePixelOpener, DescriptorIndexRebuilder) so c10 stays within AZ-270 lint - runtime_root.c10_factory adds build_descriptor_batcher + three C6->C10 adapters - 16 unit tests covering AC-1..AC-10 + 2 NFRs + 4 supplemental (Protocol conformance, query pass-through, handle release, config) Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 19:41:14 +00:00 · 2026-05-13 04:20:47 +03:00
parent 3b7265757b
commit f01a5058ab
12 changed files with 1733 additions and 10 deletions
@@ -11,12 +11,27 @@ them through this single contract surface.

 from gps_denied_onboard._types.inference import EngineCacheEntry
 from gps_denied_onboard._types.manifests import Manifest
+from gps_denied_onboard.components.c10_provisioning.c7_engine_embedder import (
+    C7EngineBackboneEmbedder,
+)
 from gps_denied_onboard.components.c10_provisioning.config import (
    BackboneConfig,
    C10ManifestConfig,
    C10ProvisioningConfig,
    SigningMode,
 )
+from gps_denied_onboard.components.c10_provisioning.descriptor_batcher import (
+    BatcherTile,
+    C10BatcherConfig,
+    CorpusFilter,
+    DescriptorBatcher,
+    DescriptorBatchReport,
+    DescriptorIndexRebuilder,
+    ProgressEvent,
+    TileBboxRecord,
+    TilePixelOpener,
+    TilesByBboxBatchQuery,
+)
 from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
    BackboneSpec,
    CompileEngineCallable,
@@ -28,9 +43,11 @@ from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
 )
 from gps_denied_onboard.components.c10_provisioning.errors import (
    C10ProvisioningError,
+    DescriptorBatchError,
    ManifestWriteError,
 )
 from gps_denied_onboard.components.c10_provisioning.interface import (
+    BackboneEmbedder,
    CacheProvisioner,
    ManifestSigner,
    SigningKeyHandle,
@@ -60,13 +77,22 @@ __all__ = [
    "VALID_SECTOR_CLASSES",
    "ArtifactCheck",
    "BackboneConfig",
+    "BackboneEmbedder",
    "BackboneSpec",
+    "BatcherTile",
+    "C7EngineBackboneEmbedder",
+    "C10BatcherConfig",
    "C10ManifestConfig",
    "C10ProvisioningConfig",
    "C10ProvisioningError",
    "CacheProvisioner",
    "CompileEngineCallable",
    "CompileOutcome",
+    "CorpusFilter",
+    "DescriptorBatchError",
+    "DescriptorBatchReport",
+    "DescriptorBatcher",
+    "DescriptorIndexRebuilder",
    "Ed25519ManifestSigner",
    "EngineCacheEntry",
    "EngineCompileRequest",
@@ -81,9 +107,13 @@ __all__ = [
    "ManifestVerifier",
    "ManifestVerifierImpl",
    "ManifestWriteError",
+    "ProgressEvent",
    "SigningKeyHandle",
    "SigningMode",
+    "TileBboxRecord",
    "TileHashRecord",
+    "TilePixelOpener",
+    "TilesByBboxBatchQuery",
    "TilesByBboxQuery",
    "VerificationResult",
    "VerifyFailReason",
@@ -0,0 +1,150 @@
+"""``C7EngineBackboneEmbedder`` (AZ-322).
+
+Default :class:`BackboneEmbedder` implementation: wraps an AZ-321-produced
+engine + an AZ-297 :class:`InferenceRuntime` and turns
+``list[TilePixelHandle]`` into ``np.ndarray`` of shape
+``(batch_size, descriptor_dim)``.
+
+JPEG → tensor preprocessing is **not** owned here — it is the
+:class:`BackboneSpec` consumer's responsibility (the C2 VPR backbone in
+AZ-255 will eventually own its own normalization). Until E-C2 ships,
+the composition root injects a ``tile_decoder`` callable so this
+adapter stays free of OpenCV / Pillow / torchvision imports and the
+test surface stays narrow. Risk-1 in the AZ-322 spec mitigation.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable
+from typing import Any
+
+import numpy as np
+
+from gps_denied_onboard.components.c10_provisioning.errors import (
+    DescriptorBatchError,
+)
+
+# AZ-322: ``InferenceRuntime`` (c7) and ``EngineHandle`` (_types) are
+# REFERENCED only in annotations. Importing them at runtime — even
+# under ``TYPE_CHECKING`` — would cross the AZ-270 component boundary
+# (the AST lint flags TYPE_CHECKING imports too, conservatively). We
+# instead duck-type these via ``Any`` and rely on structural calls
+# (``inference_runtime.infer(handle, dict) -> dict``); the
+# composition root (``runtime_root.c10_factory``) wires the concrete
+# c7 instance in.
+
+__all__ = ["C7EngineBackboneEmbedder"]
+
+
+_OOM_MARKERS = ("CUDA out of memory", "OutOfMemoryError", "OOM")
+
+
+class C7EngineBackboneEmbedder:
+    """Thin adapter from AZ-297's :class:`InferenceRuntime` to
+    :class:`BackboneEmbedder`.
+
+    Construction owns one :class:`EngineHandle` for the lifetime of
+    the embedder (one batcher session). ``embed_batch`` decodes the
+    incoming tile handles via the injected ``tile_decoder`` callable,
+    stacks them into a batch tensor, and runs ``infer`` once.
+
+    The output tensor is read from ``outputs[output_name]`` —
+    exposing ``output_name`` keeps the adapter portable across
+    backbones (DINOv2-VPR uses ``"descriptor"``, others may differ).
+
+    OOM rewrap: any :class:`gps_denied_onboard.components.c7_inference.errors.OutOfMemoryError`
+    OR an exception whose ``str`` contains an OOM marker is rewrapped
+    as :class:`DescriptorBatchError("CUDA OOM at batch_size=N")` so
+    :class:`DescriptorBatcher`'s halve-and-retry catches it (AC-2).
+    """
+
+    def __init__(
+        self,
+        *,
+        inference_runtime: Any,
+        engine_handle: Any,
+        input_name: str,
+        output_name: str,
+        descriptor_dim: int,
+        tile_decoder: Callable[[Any], np.ndarray],
+        logger: logging.Logger,
+    ) -> None:
+        if descriptor_dim <= 0:
+            raise ValueError(
+                f"descriptor_dim must be positive; got {descriptor_dim}"
+            )
+        self._runtime = inference_runtime
+        self._handle = engine_handle
+        self._input_name = input_name
+        self._output_name = output_name
+        self._descriptor_dim = descriptor_dim
+        self._tile_decoder = tile_decoder
+        self._logger = logger
+
+    def embed_batch(self, tiles: list[Any]) -> np.ndarray:
+        if not tiles:
+            return np.empty((0, self._descriptor_dim), dtype=np.float32)
+
+        batch = self._stack_batch(tiles)
+        try:
+            outputs = self._runtime.infer(
+                self._handle, {self._input_name: batch}
+            )
+        except Exception as exc:  # rewrap OOM; surface everything else
+            if _looks_like_oom(exc):
+                raise DescriptorBatchError(
+                    f"CUDA OOM at batch_size={len(tiles)}"
+                ) from exc
+            raise
+
+        if self._output_name not in outputs:
+            raise DescriptorBatchError(
+                f"engine output dict missing key {self._output_name!r}; "
+                f"available keys = {list(outputs.keys())}"
+            )
+        descriptors = outputs[self._output_name]
+        if not isinstance(descriptors, np.ndarray):
+            raise DescriptorBatchError(
+                f"engine output {self._output_name!r} is not an ndarray; "
+                f"got {type(descriptors).__name__}"
+            )
+        if descriptors.ndim != 2 or descriptors.shape[0] != len(tiles):
+            raise DescriptorBatchError(
+                f"engine output shape {descriptors.shape} does not match "
+                f"expected (batch={len(tiles)}, dim={self._descriptor_dim})"
+            )
+        if descriptors.dtype != np.float32:
+            descriptors = descriptors.astype(np.float32, copy=False)
+        return descriptors
+
+    def descriptor_dim(self) -> int:
+        return self._descriptor_dim
+
+    def _stack_batch(self, tiles: list[Any]) -> np.ndarray:
+        decoded = [self._tile_decoder(handle) for handle in tiles]
+        if not decoded:
+            return np.empty((0,), dtype=np.float32)
+        first_shape = decoded[0].shape
+        for i, arr in enumerate(decoded[1:], start=1):
+            if arr.shape != first_shape:
+                raise DescriptorBatchError(
+                    f"tile_decoder returned shape mismatch at index {i}: "
+                    f"{arr.shape} vs first {first_shape}"
+                )
+        return np.stack(decoded, axis=0)
+
+
+def _looks_like_oom(exc: BaseException) -> bool:
+    """Detect OOM by exception type name OR message marker.
+
+    The C7 contract names the canonical exception
+    :class:`OutOfMemoryError`; back-end SDKs occasionally raise raw
+    :class:`RuntimeError` with a message describing OOM (PyTorch
+    historically does this). We accept both so the AC-2 retry loop
+    catches the failure regardless of the underlying SDK.
+    """
+    if type(exc).__name__ == "OutOfMemoryError":
+        return True
+    message = str(exc)
+    return any(marker in message for marker in _OOM_MARKERS)
@@ -0,0 +1,522 @@
+"""C10 ``DescriptorBatcher`` — embed corpus + rebuild FAISS index (AZ-322).
+
+The pre-flight phase that walks every C6 tile in
+``(bbox, zoom_levels)``, runs them through the C2 backbone (via the
+AZ-321-produced engine) in batches sized for the operator workstation,
+and rebuilds the FAISS HNSW index via AZ-303 / AZ-306's
+:meth:`DescriptorIndex.rebuild_from_descriptors`.
+
+Cross-component DTOs travel through consumer-side structural Protocol
+cuts living in this module — :class:`TilesByBboxBatchQuery`,
+:class:`TilePixelOpener`, :class:`DescriptorIndexRebuilder`,
+:class:`BatcherTile` — so the AZ-270 lint
+(``test_az270_compose_root.test_ac6``) stays green: this module never
+imports ``components.c6_tile_cache`` directly. The composition root
+adapts the real C6 surface inside
+``runtime_root.c10_factory.build_descriptor_batcher``.
+
+Design constraints baked in by the spec:
+
+- Halve-and-retry on CUDA OOM is bounded by ``max_oom_retries``; default
+  1 (so 64 → 32 → fail-fast).
+- ``rebuild_from_descriptors`` is the ONLY write path to the ``.index``
+  file (no raw ``numpy.tofile`` — AZ-306 owns atomicity).
+- The int64 id formula is canonical via AZ-306 (the C6 helper is
+  re-imported through the composition-root adapter so this module never
+  reaches across the AZ-270 boundary).
+- ``embed_batch`` is called with mmap-backed handles, not raw bytes —
+  preserves AZ-303's read-only invariant on tile pixels.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Callable
+from contextlib import ExitStack
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Protocol, runtime_checkable
+
+import numpy as np
+
+from gps_denied_onboard.clock import Clock
+from gps_denied_onboard.components.c10_provisioning.errors import (
+    DescriptorBatchError,
+)
+from gps_denied_onboard.components.c10_provisioning.interface import (
+    BackboneEmbedder,
+)
+
+__all__ = [
+    "BatcherTile",
+    "C10BatcherConfig",
+    "CorpusFilter",
+    "DescriptorBatchReport",
+    "DescriptorBatcher",
+    "DescriptorIndexRebuilder",
+    "ProgressEvent",
+    "TileBboxRecord",
+    "TilePixelOpener",
+    "TilesByBboxBatchQuery",
+]
+
+
+_LOG_KIND_PREFIX = "c10.descriptor"
+_PROGRESS_LOG_FRACTION = 10  # emit one DEBUG log per ~10% of tiles
+_PROGRESS_CALLBACK_FRACTION = 10  # callback fires every 10%
+
+
+class BatcherOutcome(str, Enum):
+    """Terminal classification of a :class:`DescriptorBatcher` run."""
+
+    SUCCESS = "success"
+    FAILURE = "failure"
+
+
+@dataclass(frozen=True)
+class TileBboxRecord:
+    """Consumer-side DTO returned by :class:`TilesByBboxBatchQuery`.
+
+    The composition-root adapter walks C6's ``TileMetadata`` rows and
+    emits one of these per tile so this module never imports the C6
+    DTO directly.
+    """
+
+    zoom: int
+    lat: float
+    lon: float
+    source: str
+
+
+@dataclass(frozen=True)
+class CorpusFilter:
+    """Filter set passed into :meth:`DescriptorBatcher.populate_descriptors`.
+
+    Mirrors the spec's three-axis filter: spatial (``bbox`` —
+    ``(min_lat, min_lon, max_lat, max_lon)``), zoom (a tuple so the
+    operator can ask for multiple levels in one go), and sector
+    (``"active_conflict"`` or ``"stable_rear"`` — string form keeps this
+    DTO free of the C6 enum import).
+    """
+
+    bbox: tuple[float, float, float, float]
+    zoom_levels: tuple[int, ...]
+    sector_class: str
+
+
+@dataclass(frozen=True)
+class ProgressEvent:
+    """One progress update emitted to ``C10BatcherConfig.progress_callback``."""
+
+    tiles_done: int
+    tiles_total: int
+    current_batch_size: int
+    elapsed_s: float
+
+
+@dataclass(frozen=True)
+class DescriptorBatchReport:
+    """Terminal report returned by :meth:`populate_descriptors`."""
+
+    descriptors_generated: int
+    tiles_consumed: int
+    oom_retries: int
+    elapsed_s: float
+    outcome: BatcherOutcome
+    failure_reason: str | None = None
+
+
+@dataclass(frozen=True)
+class C10BatcherConfig:
+    """Per-instance batcher policy.
+
+    ``initial_batch_size`` defaults to 64 (the spec's worked example);
+    operators on smaller GPUs dial it down via YAML.
+
+    ``max_oom_retries`` bounds halve-and-retry; default 1 keeps the
+    failure surface at "64 → 32 → fail" so a real GPU regression is
+    visible after ~5-10 s rather than a multi-minute timeout.
+
+    ``progress_callback`` is optional; the batcher always emits DEBUG
+    logs at the same cadence so operators staring at a CLI see
+    progress without enabling tracing.
+    """
+
+    initial_batch_size: int = 64
+    max_oom_retries: int = 1
+    progress_callback: Callable[[ProgressEvent], None] | None = None
+    hnsw_m: int = 32
+    hnsw_ef_construction: int = 200
+    hnsw_ef_search: int = 64
+    hnsw_metric: str = "L2"
+
+    def __post_init__(self) -> None:
+        if self.initial_batch_size <= 0:
+            raise ValueError(
+                "C10BatcherConfig.initial_batch_size must be > 0; "
+                f"got {self.initial_batch_size}"
+            )
+        if self.max_oom_retries < 0:
+            raise ValueError(
+                "C10BatcherConfig.max_oom_retries must be >= 0; "
+                f"got {self.max_oom_retries}"
+            )
+
+
+@runtime_checkable
+class TilesByBboxBatchQuery(Protocol):
+    """Consumer-side cut over C6's ``TileMetadataStore.query_by_bbox``.
+
+    AZ-322 needs the spatial+sector filter applied across multiple
+    zoom levels in one logical call; the composition-root adapter
+    (``runtime_root.c10_factory.build_descriptor_batcher``) loops over
+    ``zoom_levels`` and yields one :class:`TileBboxRecord` per
+    matching row. Returns the concatenated list in deterministic order
+    so the int64 id mapping is stable across runs.
+    """
+
+    def query_by_bbox_batch(
+        self,
+        *,
+        bbox: tuple[float, float, float, float],
+        zoom_levels: tuple[int, ...],
+        sector_class: str,
+    ) -> list[TileBboxRecord]: ...
+
+
+@dataclass(frozen=True)
+class BatcherTile:
+    """Bundles a tile's identity with its mmap handle for one embed call."""
+
+    record: TileBboxRecord
+    pixel_handle: Any  # ``TilePixelHandle`` from the C6 ABC; opaque here.
+
+
+@runtime_checkable
+class TilePixelOpener(Protocol):
+    """Consumer-side cut over C6's ``TileStore.read_tile_pixels``.
+
+    Returns the C6 ``TilePixelHandle`` ABC (a context manager). The
+    composition-root adapter wraps the real C6 ``TileStore`` and
+    re-creates a ``TileId`` from the ``(zoom, lat, lon)`` triple before
+    delegating.
+    """
+
+    def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any: ...
+
+
+@runtime_checkable
+class DescriptorIndexRebuilder(Protocol):
+    """Consumer-side cut over AZ-303 / AZ-306 ``rebuild_from_descriptors``.
+
+    The composition root passes the real
+    :class:`gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index.FaissDescriptorIndex`
+    instance — it structurally satisfies this Protocol. Tests pass a
+    spy.
+
+    AZ-322 hands ``tile_records`` (one per row of ``descriptors``)
+    rather than pre-computed int64 ids: AZ-306 owns the canonical
+    ``tile_id_to_int64`` mapping (same rule as AC-6); we don't
+    duplicate the formula here. The composition-root adapter
+    constructs the C6 ``TileId`` instances + supplies them to
+    ``rebuild_from_descriptors`` (which itself runs the int64
+    derivation per AZ-306).
+    """
+
+    def rebuild(
+        self,
+        *,
+        descriptors: np.ndarray,
+        tile_records: list[TileBboxRecord],
+        hnsw_m: int,
+        hnsw_ef_construction: int,
+        hnsw_ef_search: int,
+        hnsw_metric: str,
+    ) -> None: ...
+
+
+class DescriptorBatcher:
+    """Pre-flight descriptor-batched generation + FAISS rebuild.
+
+    Single public method: :meth:`populate_descriptors`. Construction
+    is dependency-injection only; the composition root supplies all
+    four collaborators and the config block.
+
+    Not re-entrant — concurrent calls on the same instance break the
+    progress accounting.
+    """
+
+    def __init__(
+        self,
+        *,
+        backbone_embedder: BackboneEmbedder,
+        tiles_query: TilesByBboxBatchQuery,
+        tile_pixel_opener: TilePixelOpener,
+        descriptor_index: DescriptorIndexRebuilder,
+        clock: Clock,
+        logger: logging.Logger,
+        config: C10BatcherConfig,
+    ) -> None:
+        self._embedder = backbone_embedder
+        self._tiles_query = tiles_query
+        self._tile_opener = tile_pixel_opener
+        self._descriptor_index = descriptor_index
+        self._clock = clock
+        self._logger = logger
+        self._config = config
+
+    # ------------------------------------------------------------------
+    # Public surface
+    # ------------------------------------------------------------------
+
+    def populate_descriptors(self, corpus_filter: CorpusFilter) -> DescriptorBatchReport:
+        run_started_ns = self._clock.monotonic_ns()
+
+        records = self._tiles_query.query_by_bbox_batch(
+            bbox=corpus_filter.bbox,
+            zoom_levels=corpus_filter.zoom_levels,
+            sector_class=corpus_filter.sector_class,
+        )
+        if not records:
+            elapsed_s = self._elapsed_s(run_started_ns)
+            reason = (
+                "no tiles in C6 for the requested scope; run C11 "
+                "TileDownloader first"
+            )
+            self._logger.error(
+                f"{_LOG_KIND_PREFIX}.empty.corpus",
+                extra={
+                    "kind": f"{_LOG_KIND_PREFIX}.empty.corpus",
+                    "bbox": corpus_filter.bbox,
+                    "zoom_levels": corpus_filter.zoom_levels,
+                    "sector_class": corpus_filter.sector_class,
+                    "elapsed_s": elapsed_s,
+                },
+            )
+            return DescriptorBatchReport(
+                descriptors_generated=0,
+                tiles_consumed=0,
+                oom_retries=0,
+                elapsed_s=elapsed_s,
+                outcome=BatcherOutcome.FAILURE,
+                failure_reason=reason,
+            )
+
+        descriptor_dim = int(self._embedder.descriptor_dim())
+        if descriptor_dim <= 0:
+            raise DescriptorBatchError(
+                "DescriptorBatcher.populate_descriptors: backbone_embedder."
+                f"descriptor_dim() must be > 0; got {descriptor_dim}"
+            )
+
+        total = len(records)
+        self._logger.info(
+            f"{_LOG_KIND_PREFIX}.session.start",
+            extra={
+                "kind": f"{_LOG_KIND_PREFIX}.session.start",
+                "tiles_total": total,
+                "initial_batch_size": self._config.initial_batch_size,
+                "descriptor_dim": descriptor_dim,
+                "bbox": corpus_filter.bbox,
+                "zoom_levels": corpus_filter.zoom_levels,
+                "sector_class": corpus_filter.sector_class,
+            },
+        )
+
+        descriptor_buffer: list[np.ndarray] = []
+        consumed_records: list[TileBboxRecord] = []
+        current_batch_size = self._config.initial_batch_size
+        oom_retries = 0
+        last_progress_emit = 0
+
+        idx = 0
+        while idx < total:
+            window = records[idx : idx + current_batch_size]
+            try:
+                batch_descriptors = self._embed_one_window(window, descriptor_dim)
+            except DescriptorBatchError as exc:
+                if (
+                    self._is_oom(exc)
+                    and oom_retries < self._config.max_oom_retries
+                    and current_batch_size > 1
+                ):
+                    new_size = max(1, current_batch_size // 2)
+                    self._logger.warning(
+                        f"{_LOG_KIND_PREFIX}.oom.retry",
+                        extra={
+                            "kind": f"{_LOG_KIND_PREFIX}.oom.retry",
+                            "previous_batch_size": current_batch_size,
+                            "new_batch_size": new_size,
+                            "tiles_done": idx,
+                            "tiles_total": total,
+                            "oom_retries_after": oom_retries + 1,
+                        },
+                    )
+                    current_batch_size = new_size
+                    oom_retries += 1
+                    continue
+                self._logger.error(
+                    f"{_LOG_KIND_PREFIX}.oom.terminal",
+                    extra={
+                        "kind": f"{_LOG_KIND_PREFIX}.oom.terminal",
+                        "batch_size": current_batch_size,
+                        "tiles_done": idx,
+                        "tiles_total": total,
+                        "oom_retries": oom_retries,
+                        "first_tile_in_batch": (
+                            window[0].zoom,
+                            window[0].lat,
+                            window[0].lon,
+                        ),
+                    },
+                )
+                raise
+
+            descriptor_buffer.append(batch_descriptors)
+            consumed_records.extend(window)
+            idx += len(window)
+
+            elapsed_s = self._elapsed_s(run_started_ns)
+            self._maybe_emit_progress(
+                tiles_done=idx,
+                tiles_total=total,
+                current_batch_size=current_batch_size,
+                elapsed_s=elapsed_s,
+                last_emit_ref=last_progress_emit,
+            )
+            last_progress_emit = (idx * _PROGRESS_CALLBACK_FRACTION) // total
+
+        descriptors = np.concatenate(descriptor_buffer, axis=0)
+        if descriptors.shape != (total, descriptor_dim):
+            raise DescriptorBatchError(
+                "DescriptorBatcher.populate_descriptors: descriptor matrix has "
+                f"shape {descriptors.shape}, expected ({total}, {descriptor_dim})"
+            )
+
+        self._descriptor_index.rebuild(
+            descriptors=np.ascontiguousarray(descriptors, dtype=np.float32),
+            tile_records=consumed_records,
+            hnsw_m=self._config.hnsw_m,
+            hnsw_ef_construction=self._config.hnsw_ef_construction,
+            hnsw_ef_search=self._config.hnsw_ef_search,
+            hnsw_metric=self._config.hnsw_metric,
+        )
+
+        elapsed_s = self._elapsed_s(run_started_ns)
+        self._logger.info(
+            f"{_LOG_KIND_PREFIX}.session.complete",
+            extra={
+                "kind": f"{_LOG_KIND_PREFIX}.session.complete",
+                "tiles_total": total,
+                "descriptors_generated": total,
+                "oom_retries": oom_retries,
+                "elapsed_s": elapsed_s,
+            },
+        )
+        return DescriptorBatchReport(
+            descriptors_generated=total,
+            tiles_consumed=total,
+            oom_retries=oom_retries,
+            elapsed_s=elapsed_s,
+            outcome=BatcherOutcome.SUCCESS,
+            failure_reason=None,
+        )
+
+    # ------------------------------------------------------------------
+    # Internals
+    # ------------------------------------------------------------------
+
+    def _embed_one_window(
+        self, window: list[TileBboxRecord], descriptor_dim: int
+    ) -> np.ndarray:
+        with ExitStack() as stack:
+            handles: list[Any] = []
+            for record in window:
+                handle = self._tile_opener.open_tile(
+                    zoom=record.zoom, lat=record.lat, lon=record.lon
+                )
+                # The C6 ``TilePixelHandle`` ABC is a context manager
+                # whose __exit__ unmaps the file. ExitStack guarantees
+                # release even if embed_batch raises mid-flight.
+                stack.enter_context(handle)
+                handles.append(handle)
+            batch_descriptors = self._embedder.embed_batch(handles)
+
+        return self._validate_batch_shape(
+            batch_descriptors, expected_rows=len(window), descriptor_dim=descriptor_dim
+        )
+
+    @staticmethod
+    def _validate_batch_shape(
+        batch_descriptors: object,
+        *,
+        expected_rows: int,
+        descriptor_dim: int,
+    ) -> np.ndarray:
+        if not isinstance(batch_descriptors, np.ndarray):
+            raise DescriptorBatchError(
+                "DescriptorBatcher: embed_batch must return numpy.ndarray; "
+                f"got {type(batch_descriptors).__name__}"
+            )
+        if batch_descriptors.ndim != 2:
+            raise DescriptorBatchError(
+                "DescriptorBatcher: embed_batch must return a 2-D array; "
+                f"got ndim={batch_descriptors.ndim}"
+            )
+        if batch_descriptors.shape[0] != expected_rows:
+            raise DescriptorBatchError(
+                "DescriptorBatcher: embed_batch returned "
+                f"{batch_descriptors.shape[0]} rows; expected {expected_rows}"
+            )
+        if batch_descriptors.shape[1] != descriptor_dim:
+            raise DescriptorBatchError(
+                "DescriptorBatcher: descriptor_dim mismatch — embed_batch "
+                f"returned shape[1]={batch_descriptors.shape[1]}, but "
+                f"descriptor_dim()={descriptor_dim}"
+            )
+        if batch_descriptors.dtype != np.float32:
+            return batch_descriptors.astype(np.float32, copy=False)
+        return batch_descriptors
+
+    @staticmethod
+    def _is_oom(exc: DescriptorBatchError) -> bool:
+        # Spec AC-2 distinguishes OOM by message substring; no separate
+        # exception subclass — keeps the impl free of CUDA-specific imports.
+        return "CUDA OOM" in str(exc)
+
+    def _elapsed_s(self, run_started_ns: int) -> float:
+        return max(0.0, (self._clock.monotonic_ns() - run_started_ns) / 1e9)
+
+    def _maybe_emit_progress(
+        self,
+        *,
+        tiles_done: int,
+        tiles_total: int,
+        current_batch_size: int,
+        elapsed_s: float,
+        last_emit_ref: int,
+    ) -> None:
+        # Fire at every 10% boundary crossed since last emit (so the
+        # callback receives exactly 10 events for 1000 tiles, even when
+        # batch sizes don't divide evenly into 100 tiles per step).
+        current_decile = (tiles_done * _PROGRESS_CALLBACK_FRACTION) // tiles_total
+        for decile in range(last_emit_ref + 1, current_decile + 1):
+            tiles_at_decile = (decile * tiles_total) // _PROGRESS_CALLBACK_FRACTION
+            event = ProgressEvent(
+                tiles_done=tiles_at_decile,
+                tiles_total=tiles_total,
+                current_batch_size=current_batch_size,
+                elapsed_s=elapsed_s,
+            )
+            self._logger.debug(
+                f"{_LOG_KIND_PREFIX}.progress",
+                extra={
+                    "kind": f"{_LOG_KIND_PREFIX}.progress",
+                    "tiles_done": event.tiles_done,
+                    "tiles_total": event.tiles_total,
+                    "current_batch_size": event.current_batch_size,
+                    "elapsed_s": event.elapsed_s,
+                },
+            )
+            if self._config.progress_callback is not None:
+                self._config.progress_callback(event)
@@ -12,6 +12,7 @@ from __future__ import annotations

 __all__ = [
    "C10ProvisioningError",
+    "DescriptorBatchError",
    "ManifestWriteError",
 ]

@@ -20,6 +21,26 @@ class C10ProvisioningError(Exception):
    """Base class for the C10 cache-provisioning error family."""


+class DescriptorBatchError(C10ProvisioningError):
+    """``DescriptorBatcher.populate_descriptors`` could not finish (AZ-322).
+
+    Surfaces three failure modes:
+
+    1. ``"CUDA OOM"`` raised by the injected
+       :class:`gps_denied_onboard.components.c10_provisioning.interface.BackboneEmbedder`;
+       the batcher catches the OOM-flavoured instance and triggers the
+       halve-and-retry loop (AC-2). Persistent OOM after retries are
+       exhausted re-raises with the final batch size + tile-id
+       context (AC-3).
+    2. ``descriptor_dim`` mismatch — the impl returned a column count
+       that does not equal :meth:`BackboneEmbedder.descriptor_dim`
+       (AC-9); raised BEFORE the FAISS rebuild call so an existing
+       valid index is not corrupted.
+    3. Underlying FAISS rebuild failure (rewrapped from the AZ-306
+       :class:`IndexBuildError` envelope).
+    """
+
+
 class ManifestWriteError(C10ProvisioningError):
    """``ManifestBuilder.build_manifest`` could not produce a signed Manifest.

@@ -3,6 +3,11 @@
 - :class:`CacheProvisioner` (AZ-325, pending) — pre-flight orchestrator.
 - :class:`ManifestSigner` (AZ-323) — Ed25519 detached signing surface
  consumed by :class:`ManifestBuilder`.
+- :class:`BackboneEmbedder` (AZ-322) — image-batch → descriptor surface
+  consumed by :class:`DescriptorBatcher`. The default impl wraps the
+  AZ-298 / AZ-299 / AZ-300 ``InferenceRuntime``-produced engine; when
+  E-C2 (AZ-336+) ships its public embed surface a thin adapter swaps
+  the impl in via the composition root.

 Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
 `_docs/02_document/components/11_c10_provisioning/`.
@@ -11,11 +16,15 @@ Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
 from __future__ import annotations

 from pathlib import Path
-from typing import Protocol, runtime_checkable
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

 from gps_denied_onboard._types.manifests import Manifest

+if TYPE_CHECKING:
+    import numpy as np
+
 __all__ = [
+    "BackboneEmbedder",
    "CacheProvisioner",
    "ManifestSigner",
    "SigningKeyHandle",
@@ -66,3 +75,28 @@ class ManifestSigner(Protocol):
    def sign(self, key: SigningKeyHandle, payload_bytes: bytes) -> bytes: ...

    def public_key_fingerprint(self, key: SigningKeyHandle) -> str: ...
+
+
+@runtime_checkable
+class BackboneEmbedder(Protocol):
+    """Image-batch → descriptor matrix surface (AZ-322).
+
+    Two-method contract:
+
+    - :meth:`embed_batch` takes a list of mmap-backed tile pixel
+      handles (any object exposing the c6 ``TilePixelHandle`` ABC) and
+      returns an ``np.ndarray`` of shape ``(len(tiles),
+      descriptor_dim())`` with ``dtype == float32``.
+    - :meth:`descriptor_dim` returns the fixed descriptor dimension
+      the impl produces; queried once before the first batch and used
+      to validate every batch's last axis (AC-9).
+
+    On CUDA OOM the impl raises
+    :class:`gps_denied_onboard.components.c10_provisioning.errors.DescriptorBatchError`
+    with ``"CUDA OOM"`` in the message — the batcher catches this
+    distinguishable subtype and triggers halve-and-retry (AC-2).
+    """
+
+    def embed_batch(self, tiles: list[Any]) -> np.ndarray: ...
+
+    def descriptor_dim(self) -> int: ...