[AZ-322] C10 DescriptorBatcher (faiss-cpu, OOM halve-retry)

Implements the C10 internal phase that walks every C6 tile, embeds
through C2's backbone via the AZ-321-produced engine, and rebuilds
the AZ-306 FAISS HNSW index in one atomic write.

- DescriptorBatcher with halve-and-retry OOM recovery (default 1 retry)
- BackboneEmbedder Protocol + C7EngineBackboneEmbedder default impl
- DescriptorBatchError for OOM / dim-mismatch / missing-output failures
- Empty-corpus surfaces as outcome=failure with explicit hint to run C11
- Per-10% progress callback + DEBUG logs (no engine bytes leaked)
- Consumer-side Protocol cuts (TilesByBboxBatchQuery, TilePixelOpener,
  DescriptorIndexRebuilder) so c10 stays within AZ-270 lint
- runtime_root.c10_factory adds build_descriptor_batcher + three
  C6->C10 adapters
- 16 unit tests covering AC-1..AC-10 + 2 NFRs + 4 supplemental
  (Protocol conformance, query pass-through, handle release, config)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-13 04:20:47 +03:00
parent 3b7265757b
commit f01a5058ab
12 changed files with 1733 additions and 10 deletions
@@ -11,12 +11,27 @@ them through this single contract surface.
from gps_denied_onboard._types.inference import EngineCacheEntry
from gps_denied_onboard._types.manifests import Manifest
from gps_denied_onboard.components.c10_provisioning.c7_engine_embedder import (
C7EngineBackboneEmbedder,
)
from gps_denied_onboard.components.c10_provisioning.config import (
BackboneConfig,
C10ManifestConfig,
C10ProvisioningConfig,
SigningMode,
)
from gps_denied_onboard.components.c10_provisioning.descriptor_batcher import (
BatcherTile,
C10BatcherConfig,
CorpusFilter,
DescriptorBatcher,
DescriptorBatchReport,
DescriptorIndexRebuilder,
ProgressEvent,
TileBboxRecord,
TilePixelOpener,
TilesByBboxBatchQuery,
)
from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
BackboneSpec,
CompileEngineCallable,
@@ -28,9 +43,11 @@ from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
)
from gps_denied_onboard.components.c10_provisioning.errors import (
C10ProvisioningError,
DescriptorBatchError,
ManifestWriteError,
)
from gps_denied_onboard.components.c10_provisioning.interface import (
BackboneEmbedder,
CacheProvisioner,
ManifestSigner,
SigningKeyHandle,
@@ -60,13 +77,22 @@ __all__ = [
"VALID_SECTOR_CLASSES",
"ArtifactCheck",
"BackboneConfig",
"BackboneEmbedder",
"BackboneSpec",
"BatcherTile",
"C7EngineBackboneEmbedder",
"C10BatcherConfig",
"C10ManifestConfig",
"C10ProvisioningConfig",
"C10ProvisioningError",
"CacheProvisioner",
"CompileEngineCallable",
"CompileOutcome",
"CorpusFilter",
"DescriptorBatchError",
"DescriptorBatchReport",
"DescriptorBatcher",
"DescriptorIndexRebuilder",
"Ed25519ManifestSigner",
"EngineCacheEntry",
"EngineCompileRequest",
@@ -81,9 +107,13 @@ __all__ = [
"ManifestVerifier",
"ManifestVerifierImpl",
"ManifestWriteError",
"ProgressEvent",
"SigningKeyHandle",
"SigningMode",
"TileBboxRecord",
"TileHashRecord",
"TilePixelOpener",
"TilesByBboxBatchQuery",
"TilesByBboxQuery",
"VerificationResult",
"VerifyFailReason",
@@ -0,0 +1,150 @@
"""``C7EngineBackboneEmbedder`` (AZ-322).
Default :class:`BackboneEmbedder` implementation: wraps an AZ-321-produced
engine + an AZ-297 :class:`InferenceRuntime` and turns
``list[TilePixelHandle]`` into ``np.ndarray`` of shape
``(batch_size, descriptor_dim)``.
JPEG → tensor preprocessing is **not** owned here — it is the
:class:`BackboneSpec` consumer's responsibility (the C2 VPR backbone in
AZ-255 will eventually own its own normalization). Until E-C2 ships,
the composition root injects a ``tile_decoder`` callable so this
adapter stays free of OpenCV / Pillow / torchvision imports and the
test surface stays narrow. Risk-1 in the AZ-322 spec mitigation.
"""
from __future__ import annotations
import logging
from collections.abc import Callable
from typing import Any
import numpy as np
from gps_denied_onboard.components.c10_provisioning.errors import (
DescriptorBatchError,
)
# AZ-322: ``InferenceRuntime`` (c7) and ``EngineHandle`` (_types) are
# REFERENCED only in annotations. Importing them at runtime — even
# under ``TYPE_CHECKING`` — would cross the AZ-270 component boundary
# (the AST lint flags TYPE_CHECKING imports too, conservatively). We
# instead duck-type these via ``Any`` and rely on structural calls
# (``inference_runtime.infer(handle, dict) -> dict``); the
# composition root (``runtime_root.c10_factory``) wires the concrete
# c7 instance in.
__all__ = ["C7EngineBackboneEmbedder"]
_OOM_MARKERS = ("CUDA out of memory", "OutOfMemoryError", "OOM")
class C7EngineBackboneEmbedder:
"""Thin adapter from AZ-297's :class:`InferenceRuntime` to
:class:`BackboneEmbedder`.
Construction owns one :class:`EngineHandle` for the lifetime of
the embedder (one batcher session). ``embed_batch`` decodes the
incoming tile handles via the injected ``tile_decoder`` callable,
stacks them into a batch tensor, and runs ``infer`` once.
The output tensor is read from ``outputs[output_name]`` —
exposing ``output_name`` keeps the adapter portable across
backbones (DINOv2-VPR uses ``"descriptor"``, others may differ).
OOM rewrap: any :class:`gps_denied_onboard.components.c7_inference.errors.OutOfMemoryError`
OR an exception whose ``str`` contains an OOM marker is rewrapped
as :class:`DescriptorBatchError("CUDA OOM at batch_size=N")` so
:class:`DescriptorBatcher`'s halve-and-retry catches it (AC-2).
"""
def __init__(
self,
*,
inference_runtime: Any,
engine_handle: Any,
input_name: str,
output_name: str,
descriptor_dim: int,
tile_decoder: Callable[[Any], np.ndarray],
logger: logging.Logger,
) -> None:
if descriptor_dim <= 0:
raise ValueError(
f"descriptor_dim must be positive; got {descriptor_dim}"
)
self._runtime = inference_runtime
self._handle = engine_handle
self._input_name = input_name
self._output_name = output_name
self._descriptor_dim = descriptor_dim
self._tile_decoder = tile_decoder
self._logger = logger
def embed_batch(self, tiles: list[Any]) -> np.ndarray:
if not tiles:
return np.empty((0, self._descriptor_dim), dtype=np.float32)
batch = self._stack_batch(tiles)
try:
outputs = self._runtime.infer(
self._handle, {self._input_name: batch}
)
except Exception as exc: # rewrap OOM; surface everything else
if _looks_like_oom(exc):
raise DescriptorBatchError(
f"CUDA OOM at batch_size={len(tiles)}"
) from exc
raise
if self._output_name not in outputs:
raise DescriptorBatchError(
f"engine output dict missing key {self._output_name!r}; "
f"available keys = {list(outputs.keys())}"
)
descriptors = outputs[self._output_name]
if not isinstance(descriptors, np.ndarray):
raise DescriptorBatchError(
f"engine output {self._output_name!r} is not an ndarray; "
f"got {type(descriptors).__name__}"
)
if descriptors.ndim != 2 or descriptors.shape[0] != len(tiles):
raise DescriptorBatchError(
f"engine output shape {descriptors.shape} does not match "
f"expected (batch={len(tiles)}, dim={self._descriptor_dim})"
)
if descriptors.dtype != np.float32:
descriptors = descriptors.astype(np.float32, copy=False)
return descriptors
def descriptor_dim(self) -> int:
return self._descriptor_dim
def _stack_batch(self, tiles: list[Any]) -> np.ndarray:
decoded = [self._tile_decoder(handle) for handle in tiles]
if not decoded:
return np.empty((0,), dtype=np.float32)
first_shape = decoded[0].shape
for i, arr in enumerate(decoded[1:], start=1):
if arr.shape != first_shape:
raise DescriptorBatchError(
f"tile_decoder returned shape mismatch at index {i}: "
f"{arr.shape} vs first {first_shape}"
)
return np.stack(decoded, axis=0)
def _looks_like_oom(exc: BaseException) -> bool:
"""Detect OOM by exception type name OR message marker.
The C7 contract names the canonical exception
:class:`OutOfMemoryError`; back-end SDKs occasionally raise raw
:class:`RuntimeError` with a message describing OOM (PyTorch
historically does this). We accept both so the AC-2 retry loop
catches the failure regardless of the underlying SDK.
"""
if type(exc).__name__ == "OutOfMemoryError":
return True
message = str(exc)
return any(marker in message for marker in _OOM_MARKERS)
@@ -0,0 +1,522 @@
"""C10 ``DescriptorBatcher`` — embed corpus + rebuild FAISS index (AZ-322).
The pre-flight phase that walks every C6 tile in
``(bbox, zoom_levels)``, runs them through the C2 backbone (via the
AZ-321-produced engine) in batches sized for the operator workstation,
and rebuilds the FAISS HNSW index via AZ-303 / AZ-306's
:meth:`DescriptorIndex.rebuild_from_descriptors`.
Cross-component DTOs travel through consumer-side structural Protocol
cuts living in this module — :class:`TilesByBboxBatchQuery`,
:class:`TilePixelOpener`, :class:`DescriptorIndexRebuilder`,
:class:`BatcherTile` — so the AZ-270 lint
(``test_az270_compose_root.test_ac6``) stays green: this module never
imports ``components.c6_tile_cache`` directly. The composition root
adapts the real C6 surface inside
``runtime_root.c10_factory.build_descriptor_batcher``.
Design constraints baked in by the spec:
- Halve-and-retry on CUDA OOM is bounded by ``max_oom_retries``; default
1 (so 64 → 32 → fail-fast).
- ``rebuild_from_descriptors`` is the ONLY write path to the ``.index``
file (no raw ``numpy.tofile`` — AZ-306 owns atomicity).
- The int64 id formula is canonical via AZ-306 (the C6 helper is
re-imported through the composition-root adapter so this module never
reaches across the AZ-270 boundary).
- ``embed_batch`` is called with mmap-backed handles, not raw bytes —
preserves AZ-303's read-only invariant on tile pixels.
"""
from __future__ import annotations
import logging
from collections.abc import Callable
from contextlib import ExitStack
from dataclasses import dataclass
from enum import Enum
from typing import Any, Protocol, runtime_checkable
import numpy as np
from gps_denied_onboard.clock import Clock
from gps_denied_onboard.components.c10_provisioning.errors import (
DescriptorBatchError,
)
from gps_denied_onboard.components.c10_provisioning.interface import (
BackboneEmbedder,
)
__all__ = [
"BatcherTile",
"C10BatcherConfig",
"CorpusFilter",
"DescriptorBatchReport",
"DescriptorBatcher",
"DescriptorIndexRebuilder",
"ProgressEvent",
"TileBboxRecord",
"TilePixelOpener",
"TilesByBboxBatchQuery",
]
_LOG_KIND_PREFIX = "c10.descriptor"
_PROGRESS_LOG_FRACTION = 10 # emit one DEBUG log per ~10% of tiles
_PROGRESS_CALLBACK_FRACTION = 10 # callback fires every 10%
class BatcherOutcome(str, Enum):
"""Terminal classification of a :class:`DescriptorBatcher` run."""
SUCCESS = "success"
FAILURE = "failure"
@dataclass(frozen=True)
class TileBboxRecord:
"""Consumer-side DTO returned by :class:`TilesByBboxBatchQuery`.
The composition-root adapter walks C6's ``TileMetadata`` rows and
emits one of these per tile so this module never imports the C6
DTO directly.
"""
zoom: int
lat: float
lon: float
source: str
@dataclass(frozen=True)
class CorpusFilter:
"""Filter set passed into :meth:`DescriptorBatcher.populate_descriptors`.
Mirrors the spec's three-axis filter: spatial (``bbox`` —
``(min_lat, min_lon, max_lat, max_lon)``), zoom (a tuple so the
operator can ask for multiple levels in one go), and sector
(``"active_conflict"`` or ``"stable_rear"`` — string form keeps this
DTO free of the C6 enum import).
"""
bbox: tuple[float, float, float, float]
zoom_levels: tuple[int, ...]
sector_class: str
@dataclass(frozen=True)
class ProgressEvent:
"""One progress update emitted to ``C10BatcherConfig.progress_callback``."""
tiles_done: int
tiles_total: int
current_batch_size: int
elapsed_s: float
@dataclass(frozen=True)
class DescriptorBatchReport:
"""Terminal report returned by :meth:`populate_descriptors`."""
descriptors_generated: int
tiles_consumed: int
oom_retries: int
elapsed_s: float
outcome: BatcherOutcome
failure_reason: str | None = None
@dataclass(frozen=True)
class C10BatcherConfig:
"""Per-instance batcher policy.
``initial_batch_size`` defaults to 64 (the spec's worked example);
operators on smaller GPUs dial it down via YAML.
``max_oom_retries`` bounds halve-and-retry; default 1 keeps the
failure surface at "64 → 32 → fail" so a real GPU regression is
visible after ~5-10 s rather than a multi-minute timeout.
``progress_callback`` is optional; the batcher always emits DEBUG
logs at the same cadence so operators staring at a CLI see
progress without enabling tracing.
"""
initial_batch_size: int = 64
max_oom_retries: int = 1
progress_callback: Callable[[ProgressEvent], None] | None = None
hnsw_m: int = 32
hnsw_ef_construction: int = 200
hnsw_ef_search: int = 64
hnsw_metric: str = "L2"
def __post_init__(self) -> None:
if self.initial_batch_size <= 0:
raise ValueError(
"C10BatcherConfig.initial_batch_size must be > 0; "
f"got {self.initial_batch_size}"
)
if self.max_oom_retries < 0:
raise ValueError(
"C10BatcherConfig.max_oom_retries must be >= 0; "
f"got {self.max_oom_retries}"
)
@runtime_checkable
class TilesByBboxBatchQuery(Protocol):
"""Consumer-side cut over C6's ``TileMetadataStore.query_by_bbox``.
AZ-322 needs the spatial+sector filter applied across multiple
zoom levels in one logical call; the composition-root adapter
(``runtime_root.c10_factory.build_descriptor_batcher``) loops over
``zoom_levels`` and yields one :class:`TileBboxRecord` per
matching row. Returns the concatenated list in deterministic order
so the int64 id mapping is stable across runs.
"""
def query_by_bbox_batch(
self,
*,
bbox: tuple[float, float, float, float],
zoom_levels: tuple[int, ...],
sector_class: str,
) -> list[TileBboxRecord]: ...
@dataclass(frozen=True)
class BatcherTile:
"""Bundles a tile's identity with its mmap handle for one embed call."""
record: TileBboxRecord
pixel_handle: Any # ``TilePixelHandle`` from the C6 ABC; opaque here.
@runtime_checkable
class TilePixelOpener(Protocol):
"""Consumer-side cut over C6's ``TileStore.read_tile_pixels``.
Returns the C6 ``TilePixelHandle`` ABC (a context manager). The
composition-root adapter wraps the real C6 ``TileStore`` and
re-creates a ``TileId`` from the ``(zoom, lat, lon)`` triple before
delegating.
"""
def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any: ...
@runtime_checkable
class DescriptorIndexRebuilder(Protocol):
"""Consumer-side cut over AZ-303 / AZ-306 ``rebuild_from_descriptors``.
The composition root passes the real
:class:`gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index.FaissDescriptorIndex`
instance — it structurally satisfies this Protocol. Tests pass a
spy.
AZ-322 hands ``tile_records`` (one per row of ``descriptors``)
rather than pre-computed int64 ids: AZ-306 owns the canonical
``tile_id_to_int64`` mapping (same rule as AC-6); we don't
duplicate the formula here. The composition-root adapter
constructs the C6 ``TileId`` instances + supplies them to
``rebuild_from_descriptors`` (which itself runs the int64
derivation per AZ-306).
"""
def rebuild(
self,
*,
descriptors: np.ndarray,
tile_records: list[TileBboxRecord],
hnsw_m: int,
hnsw_ef_construction: int,
hnsw_ef_search: int,
hnsw_metric: str,
) -> None: ...
class DescriptorBatcher:
"""Pre-flight descriptor-batched generation + FAISS rebuild.
Single public method: :meth:`populate_descriptors`. Construction
is dependency-injection only; the composition root supplies all
four collaborators and the config block.
Not re-entrant — concurrent calls on the same instance break the
progress accounting.
"""
def __init__(
self,
*,
backbone_embedder: BackboneEmbedder,
tiles_query: TilesByBboxBatchQuery,
tile_pixel_opener: TilePixelOpener,
descriptor_index: DescriptorIndexRebuilder,
clock: Clock,
logger: logging.Logger,
config: C10BatcherConfig,
) -> None:
self._embedder = backbone_embedder
self._tiles_query = tiles_query
self._tile_opener = tile_pixel_opener
self._descriptor_index = descriptor_index
self._clock = clock
self._logger = logger
self._config = config
# ------------------------------------------------------------------
# Public surface
# ------------------------------------------------------------------
def populate_descriptors(self, corpus_filter: CorpusFilter) -> DescriptorBatchReport:
run_started_ns = self._clock.monotonic_ns()
records = self._tiles_query.query_by_bbox_batch(
bbox=corpus_filter.bbox,
zoom_levels=corpus_filter.zoom_levels,
sector_class=corpus_filter.sector_class,
)
if not records:
elapsed_s = self._elapsed_s(run_started_ns)
reason = (
"no tiles in C6 for the requested scope; run C11 "
"TileDownloader first"
)
self._logger.error(
f"{_LOG_KIND_PREFIX}.empty.corpus",
extra={
"kind": f"{_LOG_KIND_PREFIX}.empty.corpus",
"bbox": corpus_filter.bbox,
"zoom_levels": corpus_filter.zoom_levels,
"sector_class": corpus_filter.sector_class,
"elapsed_s": elapsed_s,
},
)
return DescriptorBatchReport(
descriptors_generated=0,
tiles_consumed=0,
oom_retries=0,
elapsed_s=elapsed_s,
outcome=BatcherOutcome.FAILURE,
failure_reason=reason,
)
descriptor_dim = int(self._embedder.descriptor_dim())
if descriptor_dim <= 0:
raise DescriptorBatchError(
"DescriptorBatcher.populate_descriptors: backbone_embedder."
f"descriptor_dim() must be > 0; got {descriptor_dim}"
)
total = len(records)
self._logger.info(
f"{_LOG_KIND_PREFIX}.session.start",
extra={
"kind": f"{_LOG_KIND_PREFIX}.session.start",
"tiles_total": total,
"initial_batch_size": self._config.initial_batch_size,
"descriptor_dim": descriptor_dim,
"bbox": corpus_filter.bbox,
"zoom_levels": corpus_filter.zoom_levels,
"sector_class": corpus_filter.sector_class,
},
)
descriptor_buffer: list[np.ndarray] = []
consumed_records: list[TileBboxRecord] = []
current_batch_size = self._config.initial_batch_size
oom_retries = 0
last_progress_emit = 0
idx = 0
while idx < total:
window = records[idx : idx + current_batch_size]
try:
batch_descriptors = self._embed_one_window(window, descriptor_dim)
except DescriptorBatchError as exc:
if (
self._is_oom(exc)
and oom_retries < self._config.max_oom_retries
and current_batch_size > 1
):
new_size = max(1, current_batch_size // 2)
self._logger.warning(
f"{_LOG_KIND_PREFIX}.oom.retry",
extra={
"kind": f"{_LOG_KIND_PREFIX}.oom.retry",
"previous_batch_size": current_batch_size,
"new_batch_size": new_size,
"tiles_done": idx,
"tiles_total": total,
"oom_retries_after": oom_retries + 1,
},
)
current_batch_size = new_size
oom_retries += 1
continue
self._logger.error(
f"{_LOG_KIND_PREFIX}.oom.terminal",
extra={
"kind": f"{_LOG_KIND_PREFIX}.oom.terminal",
"batch_size": current_batch_size,
"tiles_done": idx,
"tiles_total": total,
"oom_retries": oom_retries,
"first_tile_in_batch": (
window[0].zoom,
window[0].lat,
window[0].lon,
),
},
)
raise
descriptor_buffer.append(batch_descriptors)
consumed_records.extend(window)
idx += len(window)
elapsed_s = self._elapsed_s(run_started_ns)
self._maybe_emit_progress(
tiles_done=idx,
tiles_total=total,
current_batch_size=current_batch_size,
elapsed_s=elapsed_s,
last_emit_ref=last_progress_emit,
)
last_progress_emit = (idx * _PROGRESS_CALLBACK_FRACTION) // total
descriptors = np.concatenate(descriptor_buffer, axis=0)
if descriptors.shape != (total, descriptor_dim):
raise DescriptorBatchError(
"DescriptorBatcher.populate_descriptors: descriptor matrix has "
f"shape {descriptors.shape}, expected ({total}, {descriptor_dim})"
)
self._descriptor_index.rebuild(
descriptors=np.ascontiguousarray(descriptors, dtype=np.float32),
tile_records=consumed_records,
hnsw_m=self._config.hnsw_m,
hnsw_ef_construction=self._config.hnsw_ef_construction,
hnsw_ef_search=self._config.hnsw_ef_search,
hnsw_metric=self._config.hnsw_metric,
)
elapsed_s = self._elapsed_s(run_started_ns)
self._logger.info(
f"{_LOG_KIND_PREFIX}.session.complete",
extra={
"kind": f"{_LOG_KIND_PREFIX}.session.complete",
"tiles_total": total,
"descriptors_generated": total,
"oom_retries": oom_retries,
"elapsed_s": elapsed_s,
},
)
return DescriptorBatchReport(
descriptors_generated=total,
tiles_consumed=total,
oom_retries=oom_retries,
elapsed_s=elapsed_s,
outcome=BatcherOutcome.SUCCESS,
failure_reason=None,
)
# ------------------------------------------------------------------
# Internals
# ------------------------------------------------------------------
def _embed_one_window(
self, window: list[TileBboxRecord], descriptor_dim: int
) -> np.ndarray:
with ExitStack() as stack:
handles: list[Any] = []
for record in window:
handle = self._tile_opener.open_tile(
zoom=record.zoom, lat=record.lat, lon=record.lon
)
# The C6 ``TilePixelHandle`` ABC is a context manager
# whose __exit__ unmaps the file. ExitStack guarantees
# release even if embed_batch raises mid-flight.
stack.enter_context(handle)
handles.append(handle)
batch_descriptors = self._embedder.embed_batch(handles)
return self._validate_batch_shape(
batch_descriptors, expected_rows=len(window), descriptor_dim=descriptor_dim
)
@staticmethod
def _validate_batch_shape(
batch_descriptors: object,
*,
expected_rows: int,
descriptor_dim: int,
) -> np.ndarray:
if not isinstance(batch_descriptors, np.ndarray):
raise DescriptorBatchError(
"DescriptorBatcher: embed_batch must return numpy.ndarray; "
f"got {type(batch_descriptors).__name__}"
)
if batch_descriptors.ndim != 2:
raise DescriptorBatchError(
"DescriptorBatcher: embed_batch must return a 2-D array; "
f"got ndim={batch_descriptors.ndim}"
)
if batch_descriptors.shape[0] != expected_rows:
raise DescriptorBatchError(
"DescriptorBatcher: embed_batch returned "
f"{batch_descriptors.shape[0]} rows; expected {expected_rows}"
)
if batch_descriptors.shape[1] != descriptor_dim:
raise DescriptorBatchError(
"DescriptorBatcher: descriptor_dim mismatch — embed_batch "
f"returned shape[1]={batch_descriptors.shape[1]}, but "
f"descriptor_dim()={descriptor_dim}"
)
if batch_descriptors.dtype != np.float32:
return batch_descriptors.astype(np.float32, copy=False)
return batch_descriptors
@staticmethod
def _is_oom(exc: DescriptorBatchError) -> bool:
# Spec AC-2 distinguishes OOM by message substring; no separate
# exception subclass — keeps the impl free of CUDA-specific imports.
return "CUDA OOM" in str(exc)
def _elapsed_s(self, run_started_ns: int) -> float:
return max(0.0, (self._clock.monotonic_ns() - run_started_ns) / 1e9)
def _maybe_emit_progress(
self,
*,
tiles_done: int,
tiles_total: int,
current_batch_size: int,
elapsed_s: float,
last_emit_ref: int,
) -> None:
# Fire at every 10% boundary crossed since last emit (so the
# callback receives exactly 10 events for 1000 tiles, even when
# batch sizes don't divide evenly into 100 tiles per step).
current_decile = (tiles_done * _PROGRESS_CALLBACK_FRACTION) // tiles_total
for decile in range(last_emit_ref + 1, current_decile + 1):
tiles_at_decile = (decile * tiles_total) // _PROGRESS_CALLBACK_FRACTION
event = ProgressEvent(
tiles_done=tiles_at_decile,
tiles_total=tiles_total,
current_batch_size=current_batch_size,
elapsed_s=elapsed_s,
)
self._logger.debug(
f"{_LOG_KIND_PREFIX}.progress",
extra={
"kind": f"{_LOG_KIND_PREFIX}.progress",
"tiles_done": event.tiles_done,
"tiles_total": event.tiles_total,
"current_batch_size": event.current_batch_size,
"elapsed_s": event.elapsed_s,
},
)
if self._config.progress_callback is not None:
self._config.progress_callback(event)
@@ -12,6 +12,7 @@ from __future__ import annotations
__all__ = [
"C10ProvisioningError",
"DescriptorBatchError",
"ManifestWriteError",
]
@@ -20,6 +21,26 @@ class C10ProvisioningError(Exception):
"""Base class for the C10 cache-provisioning error family."""
class DescriptorBatchError(C10ProvisioningError):
"""``DescriptorBatcher.populate_descriptors`` could not finish (AZ-322).
Surfaces three failure modes:
1. ``"CUDA OOM"`` raised by the injected
:class:`gps_denied_onboard.components.c10_provisioning.interface.BackboneEmbedder`;
the batcher catches the OOM-flavoured instance and triggers the
halve-and-retry loop (AC-2). Persistent OOM after retries are
exhausted re-raises with the final batch size + tile-id
context (AC-3).
2. ``descriptor_dim`` mismatch — the impl returned a column count
that does not equal :meth:`BackboneEmbedder.descriptor_dim`
(AC-9); raised BEFORE the FAISS rebuild call so an existing
valid index is not corrupted.
3. Underlying FAISS rebuild failure (rewrapped from the AZ-306
:class:`IndexBuildError` envelope).
"""
class ManifestWriteError(C10ProvisioningError):
"""``ManifestBuilder.build_manifest`` could not produce a signed Manifest.
@@ -3,6 +3,11 @@
- :class:`CacheProvisioner` (AZ-325, pending) — pre-flight orchestrator.
- :class:`ManifestSigner` (AZ-323) — Ed25519 detached signing surface
consumed by :class:`ManifestBuilder`.
- :class:`BackboneEmbedder` (AZ-322) — image-batch → descriptor surface
consumed by :class:`DescriptorBatcher`. The default impl wraps the
AZ-298 / AZ-299 / AZ-300 ``InferenceRuntime``-produced engine; when
E-C2 (AZ-336+) ships its public embed surface a thin adapter swaps
the impl in via the composition root.
Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
`_docs/02_document/components/11_c10_provisioning/`.
@@ -11,11 +16,15 @@ Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
from __future__ import annotations
from pathlib import Path
from typing import Protocol, runtime_checkable
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
from gps_denied_onboard._types.manifests import Manifest
if TYPE_CHECKING:
import numpy as np
__all__ = [
"BackboneEmbedder",
"CacheProvisioner",
"ManifestSigner",
"SigningKeyHandle",
@@ -66,3 +75,28 @@ class ManifestSigner(Protocol):
def sign(self, key: SigningKeyHandle, payload_bytes: bytes) -> bytes: ...
def public_key_fingerprint(self, key: SigningKeyHandle) -> str: ...
@runtime_checkable
class BackboneEmbedder(Protocol):
"""Image-batch → descriptor matrix surface (AZ-322).
Two-method contract:
- :meth:`embed_batch` takes a list of mmap-backed tile pixel
handles (any object exposing the c6 ``TilePixelHandle`` ABC) and
returns an ``np.ndarray`` of shape ``(len(tiles),
descriptor_dim())`` with ``dtype == float32``.
- :meth:`descriptor_dim` returns the fixed descriptor dimension
the impl produces; queried once before the first batch and used
to validate every batch's last axis (AC-9).
On CUDA OOM the impl raises
:class:`gps_denied_onboard.components.c10_provisioning.errors.DescriptorBatchError`
with ``"CUDA OOM"`` in the message — the batcher catches this
distinguishable subtype and triggers halve-and-retry (AC-2).
"""
def embed_batch(self, tiles: list[Any]) -> np.ndarray: ...
def descriptor_dim(self) -> int: ...