mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-22 19:41:14 +00:00
[AZ-322] C10 DescriptorBatcher (faiss-cpu, OOM halve-retry)
Implements the C10 internal phase that walks every C6 tile, embeds through C2's backbone via the AZ-321-produced engine, and rebuilds the AZ-306 FAISS HNSW index in one atomic write. - DescriptorBatcher with halve-and-retry OOM recovery (default 1 retry) - BackboneEmbedder Protocol + C7EngineBackboneEmbedder default impl - DescriptorBatchError for OOM / dim-mismatch / missing-output failures - Empty-corpus surfaces as outcome=failure with explicit hint to run C11 - Per-10% progress callback + DEBUG logs (no engine bytes leaked) - Consumer-side Protocol cuts (TilesByBboxBatchQuery, TilePixelOpener, DescriptorIndexRebuilder) so c10 stays within AZ-270 lint - runtime_root.c10_factory adds build_descriptor_batcher + three C6->C10 adapters - 16 unit tests covering AC-1..AC-10 + 2 NFRs + 4 supplemental (Protocol conformance, query pass-through, handle release, config) Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -11,12 +11,27 @@ them through this single contract surface.
|
||||
|
||||
from gps_denied_onboard._types.inference import EngineCacheEntry
|
||||
from gps_denied_onboard._types.manifests import Manifest
|
||||
from gps_denied_onboard.components.c10_provisioning.c7_engine_embedder import (
|
||||
C7EngineBackboneEmbedder,
|
||||
)
|
||||
from gps_denied_onboard.components.c10_provisioning.config import (
|
||||
BackboneConfig,
|
||||
C10ManifestConfig,
|
||||
C10ProvisioningConfig,
|
||||
SigningMode,
|
||||
)
|
||||
from gps_denied_onboard.components.c10_provisioning.descriptor_batcher import (
|
||||
BatcherTile,
|
||||
C10BatcherConfig,
|
||||
CorpusFilter,
|
||||
DescriptorBatcher,
|
||||
DescriptorBatchReport,
|
||||
DescriptorIndexRebuilder,
|
||||
ProgressEvent,
|
||||
TileBboxRecord,
|
||||
TilePixelOpener,
|
||||
TilesByBboxBatchQuery,
|
||||
)
|
||||
from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
|
||||
BackboneSpec,
|
||||
CompileEngineCallable,
|
||||
@@ -28,9 +43,11 @@ from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
|
||||
)
|
||||
from gps_denied_onboard.components.c10_provisioning.errors import (
|
||||
C10ProvisioningError,
|
||||
DescriptorBatchError,
|
||||
ManifestWriteError,
|
||||
)
|
||||
from gps_denied_onboard.components.c10_provisioning.interface import (
|
||||
BackboneEmbedder,
|
||||
CacheProvisioner,
|
||||
ManifestSigner,
|
||||
SigningKeyHandle,
|
||||
@@ -60,13 +77,22 @@ __all__ = [
|
||||
"VALID_SECTOR_CLASSES",
|
||||
"ArtifactCheck",
|
||||
"BackboneConfig",
|
||||
"BackboneEmbedder",
|
||||
"BackboneSpec",
|
||||
"BatcherTile",
|
||||
"C7EngineBackboneEmbedder",
|
||||
"C10BatcherConfig",
|
||||
"C10ManifestConfig",
|
||||
"C10ProvisioningConfig",
|
||||
"C10ProvisioningError",
|
||||
"CacheProvisioner",
|
||||
"CompileEngineCallable",
|
||||
"CompileOutcome",
|
||||
"CorpusFilter",
|
||||
"DescriptorBatchError",
|
||||
"DescriptorBatchReport",
|
||||
"DescriptorBatcher",
|
||||
"DescriptorIndexRebuilder",
|
||||
"Ed25519ManifestSigner",
|
||||
"EngineCacheEntry",
|
||||
"EngineCompileRequest",
|
||||
@@ -81,9 +107,13 @@ __all__ = [
|
||||
"ManifestVerifier",
|
||||
"ManifestVerifierImpl",
|
||||
"ManifestWriteError",
|
||||
"ProgressEvent",
|
||||
"SigningKeyHandle",
|
||||
"SigningMode",
|
||||
"TileBboxRecord",
|
||||
"TileHashRecord",
|
||||
"TilePixelOpener",
|
||||
"TilesByBboxBatchQuery",
|
||||
"TilesByBboxQuery",
|
||||
"VerificationResult",
|
||||
"VerifyFailReason",
|
||||
|
||||
@@ -0,0 +1,150 @@
|
||||
"""``C7EngineBackboneEmbedder`` (AZ-322).
|
||||
|
||||
Default :class:`BackboneEmbedder` implementation: wraps an AZ-321-produced
|
||||
engine + an AZ-297 :class:`InferenceRuntime` and turns
|
||||
``list[TilePixelHandle]`` into ``np.ndarray`` of shape
|
||||
``(batch_size, descriptor_dim)``.
|
||||
|
||||
JPEG → tensor preprocessing is **not** owned here — it is the
|
||||
:class:`BackboneSpec` consumer's responsibility (the C2 VPR backbone in
|
||||
AZ-255 will eventually own its own normalization). Until E-C2 ships,
|
||||
the composition root injects a ``tile_decoder`` callable so this
|
||||
adapter stays free of OpenCV / Pillow / torchvision imports and the
|
||||
test surface stays narrow. Risk-1 in the AZ-322 spec mitigation.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from gps_denied_onboard.components.c10_provisioning.errors import (
|
||||
DescriptorBatchError,
|
||||
)
|
||||
|
||||
# AZ-322: ``InferenceRuntime`` (c7) and ``EngineHandle`` (_types) are
|
||||
# REFERENCED only in annotations. Importing them at runtime — even
|
||||
# under ``TYPE_CHECKING`` — would cross the AZ-270 component boundary
|
||||
# (the AST lint flags TYPE_CHECKING imports too, conservatively). We
|
||||
# instead duck-type these via ``Any`` and rely on structural calls
|
||||
# (``inference_runtime.infer(handle, dict) -> dict``); the
|
||||
# composition root (``runtime_root.c10_factory``) wires the concrete
|
||||
# c7 instance in.
|
||||
|
||||
__all__ = ["C7EngineBackboneEmbedder"]
|
||||
|
||||
|
||||
_OOM_MARKERS = ("CUDA out of memory", "OutOfMemoryError", "OOM")
|
||||
|
||||
|
||||
class C7EngineBackboneEmbedder:
|
||||
"""Thin adapter from AZ-297's :class:`InferenceRuntime` to
|
||||
:class:`BackboneEmbedder`.
|
||||
|
||||
Construction owns one :class:`EngineHandle` for the lifetime of
|
||||
the embedder (one batcher session). ``embed_batch`` decodes the
|
||||
incoming tile handles via the injected ``tile_decoder`` callable,
|
||||
stacks them into a batch tensor, and runs ``infer`` once.
|
||||
|
||||
The output tensor is read from ``outputs[output_name]`` —
|
||||
exposing ``output_name`` keeps the adapter portable across
|
||||
backbones (DINOv2-VPR uses ``"descriptor"``, others may differ).
|
||||
|
||||
OOM rewrap: any :class:`gps_denied_onboard.components.c7_inference.errors.OutOfMemoryError`
|
||||
OR an exception whose ``str`` contains an OOM marker is rewrapped
|
||||
as :class:`DescriptorBatchError("CUDA OOM at batch_size=N")` so
|
||||
:class:`DescriptorBatcher`'s halve-and-retry catches it (AC-2).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
inference_runtime: Any,
|
||||
engine_handle: Any,
|
||||
input_name: str,
|
||||
output_name: str,
|
||||
descriptor_dim: int,
|
||||
tile_decoder: Callable[[Any], np.ndarray],
|
||||
logger: logging.Logger,
|
||||
) -> None:
|
||||
if descriptor_dim <= 0:
|
||||
raise ValueError(
|
||||
f"descriptor_dim must be positive; got {descriptor_dim}"
|
||||
)
|
||||
self._runtime = inference_runtime
|
||||
self._handle = engine_handle
|
||||
self._input_name = input_name
|
||||
self._output_name = output_name
|
||||
self._descriptor_dim = descriptor_dim
|
||||
self._tile_decoder = tile_decoder
|
||||
self._logger = logger
|
||||
|
||||
def embed_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||
if not tiles:
|
||||
return np.empty((0, self._descriptor_dim), dtype=np.float32)
|
||||
|
||||
batch = self._stack_batch(tiles)
|
||||
try:
|
||||
outputs = self._runtime.infer(
|
||||
self._handle, {self._input_name: batch}
|
||||
)
|
||||
except Exception as exc: # rewrap OOM; surface everything else
|
||||
if _looks_like_oom(exc):
|
||||
raise DescriptorBatchError(
|
||||
f"CUDA OOM at batch_size={len(tiles)}"
|
||||
) from exc
|
||||
raise
|
||||
|
||||
if self._output_name not in outputs:
|
||||
raise DescriptorBatchError(
|
||||
f"engine output dict missing key {self._output_name!r}; "
|
||||
f"available keys = {list(outputs.keys())}"
|
||||
)
|
||||
descriptors = outputs[self._output_name]
|
||||
if not isinstance(descriptors, np.ndarray):
|
||||
raise DescriptorBatchError(
|
||||
f"engine output {self._output_name!r} is not an ndarray; "
|
||||
f"got {type(descriptors).__name__}"
|
||||
)
|
||||
if descriptors.ndim != 2 or descriptors.shape[0] != len(tiles):
|
||||
raise DescriptorBatchError(
|
||||
f"engine output shape {descriptors.shape} does not match "
|
||||
f"expected (batch={len(tiles)}, dim={self._descriptor_dim})"
|
||||
)
|
||||
if descriptors.dtype != np.float32:
|
||||
descriptors = descriptors.astype(np.float32, copy=False)
|
||||
return descriptors
|
||||
|
||||
def descriptor_dim(self) -> int:
|
||||
return self._descriptor_dim
|
||||
|
||||
def _stack_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||
decoded = [self._tile_decoder(handle) for handle in tiles]
|
||||
if not decoded:
|
||||
return np.empty((0,), dtype=np.float32)
|
||||
first_shape = decoded[0].shape
|
||||
for i, arr in enumerate(decoded[1:], start=1):
|
||||
if arr.shape != first_shape:
|
||||
raise DescriptorBatchError(
|
||||
f"tile_decoder returned shape mismatch at index {i}: "
|
||||
f"{arr.shape} vs first {first_shape}"
|
||||
)
|
||||
return np.stack(decoded, axis=0)
|
||||
|
||||
|
||||
def _looks_like_oom(exc: BaseException) -> bool:
|
||||
"""Detect OOM by exception type name OR message marker.
|
||||
|
||||
The C7 contract names the canonical exception
|
||||
:class:`OutOfMemoryError`; back-end SDKs occasionally raise raw
|
||||
:class:`RuntimeError` with a message describing OOM (PyTorch
|
||||
historically does this). We accept both so the AC-2 retry loop
|
||||
catches the failure regardless of the underlying SDK.
|
||||
"""
|
||||
if type(exc).__name__ == "OutOfMemoryError":
|
||||
return True
|
||||
message = str(exc)
|
||||
return any(marker in message for marker in _OOM_MARKERS)
|
||||
@@ -0,0 +1,522 @@
|
||||
"""C10 ``DescriptorBatcher`` — embed corpus + rebuild FAISS index (AZ-322).
|
||||
|
||||
The pre-flight phase that walks every C6 tile in
|
||||
``(bbox, zoom_levels)``, runs them through the C2 backbone (via the
|
||||
AZ-321-produced engine) in batches sized for the operator workstation,
|
||||
and rebuilds the FAISS HNSW index via AZ-303 / AZ-306's
|
||||
:meth:`DescriptorIndex.rebuild_from_descriptors`.
|
||||
|
||||
Cross-component DTOs travel through consumer-side structural Protocol
|
||||
cuts living in this module — :class:`TilesByBboxBatchQuery`,
|
||||
:class:`TilePixelOpener`, :class:`DescriptorIndexRebuilder`,
|
||||
:class:`BatcherTile` — so the AZ-270 lint
|
||||
(``test_az270_compose_root.test_ac6``) stays green: this module never
|
||||
imports ``components.c6_tile_cache`` directly. The composition root
|
||||
adapts the real C6 surface inside
|
||||
``runtime_root.c10_factory.build_descriptor_batcher``.
|
||||
|
||||
Design constraints baked in by the spec:
|
||||
|
||||
- Halve-and-retry on CUDA OOM is bounded by ``max_oom_retries``; default
|
||||
1 (so 64 → 32 → fail-fast).
|
||||
- ``rebuild_from_descriptors`` is the ONLY write path to the ``.index``
|
||||
file (no raw ``numpy.tofile`` — AZ-306 owns atomicity).
|
||||
- The int64 id formula is canonical via AZ-306 (the C6 helper is
|
||||
re-imported through the composition-root adapter so this module never
|
||||
reaches across the AZ-270 boundary).
|
||||
- ``embed_batch`` is called with mmap-backed handles, not raw bytes —
|
||||
preserves AZ-303's read-only invariant on tile pixels.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from contextlib import ExitStack
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Protocol, runtime_checkable
|
||||
|
||||
import numpy as np
|
||||
|
||||
from gps_denied_onboard.clock import Clock
|
||||
from gps_denied_onboard.components.c10_provisioning.errors import (
|
||||
DescriptorBatchError,
|
||||
)
|
||||
from gps_denied_onboard.components.c10_provisioning.interface import (
|
||||
BackboneEmbedder,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"BatcherTile",
|
||||
"C10BatcherConfig",
|
||||
"CorpusFilter",
|
||||
"DescriptorBatchReport",
|
||||
"DescriptorBatcher",
|
||||
"DescriptorIndexRebuilder",
|
||||
"ProgressEvent",
|
||||
"TileBboxRecord",
|
||||
"TilePixelOpener",
|
||||
"TilesByBboxBatchQuery",
|
||||
]
|
||||
|
||||
|
||||
_LOG_KIND_PREFIX = "c10.descriptor"
|
||||
_PROGRESS_LOG_FRACTION = 10 # emit one DEBUG log per ~10% of tiles
|
||||
_PROGRESS_CALLBACK_FRACTION = 10 # callback fires every 10%
|
||||
|
||||
|
||||
class BatcherOutcome(str, Enum):
|
||||
"""Terminal classification of a :class:`DescriptorBatcher` run."""
|
||||
|
||||
SUCCESS = "success"
|
||||
FAILURE = "failure"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TileBboxRecord:
|
||||
"""Consumer-side DTO returned by :class:`TilesByBboxBatchQuery`.
|
||||
|
||||
The composition-root adapter walks C6's ``TileMetadata`` rows and
|
||||
emits one of these per tile so this module never imports the C6
|
||||
DTO directly.
|
||||
"""
|
||||
|
||||
zoom: int
|
||||
lat: float
|
||||
lon: float
|
||||
source: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CorpusFilter:
|
||||
"""Filter set passed into :meth:`DescriptorBatcher.populate_descriptors`.
|
||||
|
||||
Mirrors the spec's three-axis filter: spatial (``bbox`` —
|
||||
``(min_lat, min_lon, max_lat, max_lon)``), zoom (a tuple so the
|
||||
operator can ask for multiple levels in one go), and sector
|
||||
(``"active_conflict"`` or ``"stable_rear"`` — string form keeps this
|
||||
DTO free of the C6 enum import).
|
||||
"""
|
||||
|
||||
bbox: tuple[float, float, float, float]
|
||||
zoom_levels: tuple[int, ...]
|
||||
sector_class: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ProgressEvent:
|
||||
"""One progress update emitted to ``C10BatcherConfig.progress_callback``."""
|
||||
|
||||
tiles_done: int
|
||||
tiles_total: int
|
||||
current_batch_size: int
|
||||
elapsed_s: float
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DescriptorBatchReport:
|
||||
"""Terminal report returned by :meth:`populate_descriptors`."""
|
||||
|
||||
descriptors_generated: int
|
||||
tiles_consumed: int
|
||||
oom_retries: int
|
||||
elapsed_s: float
|
||||
outcome: BatcherOutcome
|
||||
failure_reason: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class C10BatcherConfig:
|
||||
"""Per-instance batcher policy.
|
||||
|
||||
``initial_batch_size`` defaults to 64 (the spec's worked example);
|
||||
operators on smaller GPUs dial it down via YAML.
|
||||
|
||||
``max_oom_retries`` bounds halve-and-retry; default 1 keeps the
|
||||
failure surface at "64 → 32 → fail" so a real GPU regression is
|
||||
visible after ~5-10 s rather than a multi-minute timeout.
|
||||
|
||||
``progress_callback`` is optional; the batcher always emits DEBUG
|
||||
logs at the same cadence so operators staring at a CLI see
|
||||
progress without enabling tracing.
|
||||
"""
|
||||
|
||||
initial_batch_size: int = 64
|
||||
max_oom_retries: int = 1
|
||||
progress_callback: Callable[[ProgressEvent], None] | None = None
|
||||
hnsw_m: int = 32
|
||||
hnsw_ef_construction: int = 200
|
||||
hnsw_ef_search: int = 64
|
||||
hnsw_metric: str = "L2"
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.initial_batch_size <= 0:
|
||||
raise ValueError(
|
||||
"C10BatcherConfig.initial_batch_size must be > 0; "
|
||||
f"got {self.initial_batch_size}"
|
||||
)
|
||||
if self.max_oom_retries < 0:
|
||||
raise ValueError(
|
||||
"C10BatcherConfig.max_oom_retries must be >= 0; "
|
||||
f"got {self.max_oom_retries}"
|
||||
)
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class TilesByBboxBatchQuery(Protocol):
|
||||
"""Consumer-side cut over C6's ``TileMetadataStore.query_by_bbox``.
|
||||
|
||||
AZ-322 needs the spatial+sector filter applied across multiple
|
||||
zoom levels in one logical call; the composition-root adapter
|
||||
(``runtime_root.c10_factory.build_descriptor_batcher``) loops over
|
||||
``zoom_levels`` and yields one :class:`TileBboxRecord` per
|
||||
matching row. Returns the concatenated list in deterministic order
|
||||
so the int64 id mapping is stable across runs.
|
||||
"""
|
||||
|
||||
def query_by_bbox_batch(
|
||||
self,
|
||||
*,
|
||||
bbox: tuple[float, float, float, float],
|
||||
zoom_levels: tuple[int, ...],
|
||||
sector_class: str,
|
||||
) -> list[TileBboxRecord]: ...
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BatcherTile:
|
||||
"""Bundles a tile's identity with its mmap handle for one embed call."""
|
||||
|
||||
record: TileBboxRecord
|
||||
pixel_handle: Any # ``TilePixelHandle`` from the C6 ABC; opaque here.
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class TilePixelOpener(Protocol):
|
||||
"""Consumer-side cut over C6's ``TileStore.read_tile_pixels``.
|
||||
|
||||
Returns the C6 ``TilePixelHandle`` ABC (a context manager). The
|
||||
composition-root adapter wraps the real C6 ``TileStore`` and
|
||||
re-creates a ``TileId`` from the ``(zoom, lat, lon)`` triple before
|
||||
delegating.
|
||||
"""
|
||||
|
||||
def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any: ...
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class DescriptorIndexRebuilder(Protocol):
|
||||
"""Consumer-side cut over AZ-303 / AZ-306 ``rebuild_from_descriptors``.
|
||||
|
||||
The composition root passes the real
|
||||
:class:`gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index.FaissDescriptorIndex`
|
||||
instance — it structurally satisfies this Protocol. Tests pass a
|
||||
spy.
|
||||
|
||||
AZ-322 hands ``tile_records`` (one per row of ``descriptors``)
|
||||
rather than pre-computed int64 ids: AZ-306 owns the canonical
|
||||
``tile_id_to_int64`` mapping (same rule as AC-6); we don't
|
||||
duplicate the formula here. The composition-root adapter
|
||||
constructs the C6 ``TileId`` instances + supplies them to
|
||||
``rebuild_from_descriptors`` (which itself runs the int64
|
||||
derivation per AZ-306).
|
||||
"""
|
||||
|
||||
def rebuild(
|
||||
self,
|
||||
*,
|
||||
descriptors: np.ndarray,
|
||||
tile_records: list[TileBboxRecord],
|
||||
hnsw_m: int,
|
||||
hnsw_ef_construction: int,
|
||||
hnsw_ef_search: int,
|
||||
hnsw_metric: str,
|
||||
) -> None: ...
|
||||
|
||||
|
||||
class DescriptorBatcher:
|
||||
"""Pre-flight descriptor-batched generation + FAISS rebuild.
|
||||
|
||||
Single public method: :meth:`populate_descriptors`. Construction
|
||||
is dependency-injection only; the composition root supplies all
|
||||
four collaborators and the config block.
|
||||
|
||||
Not re-entrant — concurrent calls on the same instance break the
|
||||
progress accounting.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
backbone_embedder: BackboneEmbedder,
|
||||
tiles_query: TilesByBboxBatchQuery,
|
||||
tile_pixel_opener: TilePixelOpener,
|
||||
descriptor_index: DescriptorIndexRebuilder,
|
||||
clock: Clock,
|
||||
logger: logging.Logger,
|
||||
config: C10BatcherConfig,
|
||||
) -> None:
|
||||
self._embedder = backbone_embedder
|
||||
self._tiles_query = tiles_query
|
||||
self._tile_opener = tile_pixel_opener
|
||||
self._descriptor_index = descriptor_index
|
||||
self._clock = clock
|
||||
self._logger = logger
|
||||
self._config = config
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public surface
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def populate_descriptors(self, corpus_filter: CorpusFilter) -> DescriptorBatchReport:
|
||||
run_started_ns = self._clock.monotonic_ns()
|
||||
|
||||
records = self._tiles_query.query_by_bbox_batch(
|
||||
bbox=corpus_filter.bbox,
|
||||
zoom_levels=corpus_filter.zoom_levels,
|
||||
sector_class=corpus_filter.sector_class,
|
||||
)
|
||||
if not records:
|
||||
elapsed_s = self._elapsed_s(run_started_ns)
|
||||
reason = (
|
||||
"no tiles in C6 for the requested scope; run C11 "
|
||||
"TileDownloader first"
|
||||
)
|
||||
self._logger.error(
|
||||
f"{_LOG_KIND_PREFIX}.empty.corpus",
|
||||
extra={
|
||||
"kind": f"{_LOG_KIND_PREFIX}.empty.corpus",
|
||||
"bbox": corpus_filter.bbox,
|
||||
"zoom_levels": corpus_filter.zoom_levels,
|
||||
"sector_class": corpus_filter.sector_class,
|
||||
"elapsed_s": elapsed_s,
|
||||
},
|
||||
)
|
||||
return DescriptorBatchReport(
|
||||
descriptors_generated=0,
|
||||
tiles_consumed=0,
|
||||
oom_retries=0,
|
||||
elapsed_s=elapsed_s,
|
||||
outcome=BatcherOutcome.FAILURE,
|
||||
failure_reason=reason,
|
||||
)
|
||||
|
||||
descriptor_dim = int(self._embedder.descriptor_dim())
|
||||
if descriptor_dim <= 0:
|
||||
raise DescriptorBatchError(
|
||||
"DescriptorBatcher.populate_descriptors: backbone_embedder."
|
||||
f"descriptor_dim() must be > 0; got {descriptor_dim}"
|
||||
)
|
||||
|
||||
total = len(records)
|
||||
self._logger.info(
|
||||
f"{_LOG_KIND_PREFIX}.session.start",
|
||||
extra={
|
||||
"kind": f"{_LOG_KIND_PREFIX}.session.start",
|
||||
"tiles_total": total,
|
||||
"initial_batch_size": self._config.initial_batch_size,
|
||||
"descriptor_dim": descriptor_dim,
|
||||
"bbox": corpus_filter.bbox,
|
||||
"zoom_levels": corpus_filter.zoom_levels,
|
||||
"sector_class": corpus_filter.sector_class,
|
||||
},
|
||||
)
|
||||
|
||||
descriptor_buffer: list[np.ndarray] = []
|
||||
consumed_records: list[TileBboxRecord] = []
|
||||
current_batch_size = self._config.initial_batch_size
|
||||
oom_retries = 0
|
||||
last_progress_emit = 0
|
||||
|
||||
idx = 0
|
||||
while idx < total:
|
||||
window = records[idx : idx + current_batch_size]
|
||||
try:
|
||||
batch_descriptors = self._embed_one_window(window, descriptor_dim)
|
||||
except DescriptorBatchError as exc:
|
||||
if (
|
||||
self._is_oom(exc)
|
||||
and oom_retries < self._config.max_oom_retries
|
||||
and current_batch_size > 1
|
||||
):
|
||||
new_size = max(1, current_batch_size // 2)
|
||||
self._logger.warning(
|
||||
f"{_LOG_KIND_PREFIX}.oom.retry",
|
||||
extra={
|
||||
"kind": f"{_LOG_KIND_PREFIX}.oom.retry",
|
||||
"previous_batch_size": current_batch_size,
|
||||
"new_batch_size": new_size,
|
||||
"tiles_done": idx,
|
||||
"tiles_total": total,
|
||||
"oom_retries_after": oom_retries + 1,
|
||||
},
|
||||
)
|
||||
current_batch_size = new_size
|
||||
oom_retries += 1
|
||||
continue
|
||||
self._logger.error(
|
||||
f"{_LOG_KIND_PREFIX}.oom.terminal",
|
||||
extra={
|
||||
"kind": f"{_LOG_KIND_PREFIX}.oom.terminal",
|
||||
"batch_size": current_batch_size,
|
||||
"tiles_done": idx,
|
||||
"tiles_total": total,
|
||||
"oom_retries": oom_retries,
|
||||
"first_tile_in_batch": (
|
||||
window[0].zoom,
|
||||
window[0].lat,
|
||||
window[0].lon,
|
||||
),
|
||||
},
|
||||
)
|
||||
raise
|
||||
|
||||
descriptor_buffer.append(batch_descriptors)
|
||||
consumed_records.extend(window)
|
||||
idx += len(window)
|
||||
|
||||
elapsed_s = self._elapsed_s(run_started_ns)
|
||||
self._maybe_emit_progress(
|
||||
tiles_done=idx,
|
||||
tiles_total=total,
|
||||
current_batch_size=current_batch_size,
|
||||
elapsed_s=elapsed_s,
|
||||
last_emit_ref=last_progress_emit,
|
||||
)
|
||||
last_progress_emit = (idx * _PROGRESS_CALLBACK_FRACTION) // total
|
||||
|
||||
descriptors = np.concatenate(descriptor_buffer, axis=0)
|
||||
if descriptors.shape != (total, descriptor_dim):
|
||||
raise DescriptorBatchError(
|
||||
"DescriptorBatcher.populate_descriptors: descriptor matrix has "
|
||||
f"shape {descriptors.shape}, expected ({total}, {descriptor_dim})"
|
||||
)
|
||||
|
||||
self._descriptor_index.rebuild(
|
||||
descriptors=np.ascontiguousarray(descriptors, dtype=np.float32),
|
||||
tile_records=consumed_records,
|
||||
hnsw_m=self._config.hnsw_m,
|
||||
hnsw_ef_construction=self._config.hnsw_ef_construction,
|
||||
hnsw_ef_search=self._config.hnsw_ef_search,
|
||||
hnsw_metric=self._config.hnsw_metric,
|
||||
)
|
||||
|
||||
elapsed_s = self._elapsed_s(run_started_ns)
|
||||
self._logger.info(
|
||||
f"{_LOG_KIND_PREFIX}.session.complete",
|
||||
extra={
|
||||
"kind": f"{_LOG_KIND_PREFIX}.session.complete",
|
||||
"tiles_total": total,
|
||||
"descriptors_generated": total,
|
||||
"oom_retries": oom_retries,
|
||||
"elapsed_s": elapsed_s,
|
||||
},
|
||||
)
|
||||
return DescriptorBatchReport(
|
||||
descriptors_generated=total,
|
||||
tiles_consumed=total,
|
||||
oom_retries=oom_retries,
|
||||
elapsed_s=elapsed_s,
|
||||
outcome=BatcherOutcome.SUCCESS,
|
||||
failure_reason=None,
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internals
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _embed_one_window(
|
||||
self, window: list[TileBboxRecord], descriptor_dim: int
|
||||
) -> np.ndarray:
|
||||
with ExitStack() as stack:
|
||||
handles: list[Any] = []
|
||||
for record in window:
|
||||
handle = self._tile_opener.open_tile(
|
||||
zoom=record.zoom, lat=record.lat, lon=record.lon
|
||||
)
|
||||
# The C6 ``TilePixelHandle`` ABC is a context manager
|
||||
# whose __exit__ unmaps the file. ExitStack guarantees
|
||||
# release even if embed_batch raises mid-flight.
|
||||
stack.enter_context(handle)
|
||||
handles.append(handle)
|
||||
batch_descriptors = self._embedder.embed_batch(handles)
|
||||
|
||||
return self._validate_batch_shape(
|
||||
batch_descriptors, expected_rows=len(window), descriptor_dim=descriptor_dim
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _validate_batch_shape(
|
||||
batch_descriptors: object,
|
||||
*,
|
||||
expected_rows: int,
|
||||
descriptor_dim: int,
|
||||
) -> np.ndarray:
|
||||
if not isinstance(batch_descriptors, np.ndarray):
|
||||
raise DescriptorBatchError(
|
||||
"DescriptorBatcher: embed_batch must return numpy.ndarray; "
|
||||
f"got {type(batch_descriptors).__name__}"
|
||||
)
|
||||
if batch_descriptors.ndim != 2:
|
||||
raise DescriptorBatchError(
|
||||
"DescriptorBatcher: embed_batch must return a 2-D array; "
|
||||
f"got ndim={batch_descriptors.ndim}"
|
||||
)
|
||||
if batch_descriptors.shape[0] != expected_rows:
|
||||
raise DescriptorBatchError(
|
||||
"DescriptorBatcher: embed_batch returned "
|
||||
f"{batch_descriptors.shape[0]} rows; expected {expected_rows}"
|
||||
)
|
||||
if batch_descriptors.shape[1] != descriptor_dim:
|
||||
raise DescriptorBatchError(
|
||||
"DescriptorBatcher: descriptor_dim mismatch — embed_batch "
|
||||
f"returned shape[1]={batch_descriptors.shape[1]}, but "
|
||||
f"descriptor_dim()={descriptor_dim}"
|
||||
)
|
||||
if batch_descriptors.dtype != np.float32:
|
||||
return batch_descriptors.astype(np.float32, copy=False)
|
||||
return batch_descriptors
|
||||
|
||||
@staticmethod
|
||||
def _is_oom(exc: DescriptorBatchError) -> bool:
|
||||
# Spec AC-2 distinguishes OOM by message substring; no separate
|
||||
# exception subclass — keeps the impl free of CUDA-specific imports.
|
||||
return "CUDA OOM" in str(exc)
|
||||
|
||||
def _elapsed_s(self, run_started_ns: int) -> float:
|
||||
return max(0.0, (self._clock.monotonic_ns() - run_started_ns) / 1e9)
|
||||
|
||||
def _maybe_emit_progress(
|
||||
self,
|
||||
*,
|
||||
tiles_done: int,
|
||||
tiles_total: int,
|
||||
current_batch_size: int,
|
||||
elapsed_s: float,
|
||||
last_emit_ref: int,
|
||||
) -> None:
|
||||
# Fire at every 10% boundary crossed since last emit (so the
|
||||
# callback receives exactly 10 events for 1000 tiles, even when
|
||||
# batch sizes don't divide evenly into 100 tiles per step).
|
||||
current_decile = (tiles_done * _PROGRESS_CALLBACK_FRACTION) // tiles_total
|
||||
for decile in range(last_emit_ref + 1, current_decile + 1):
|
||||
tiles_at_decile = (decile * tiles_total) // _PROGRESS_CALLBACK_FRACTION
|
||||
event = ProgressEvent(
|
||||
tiles_done=tiles_at_decile,
|
||||
tiles_total=tiles_total,
|
||||
current_batch_size=current_batch_size,
|
||||
elapsed_s=elapsed_s,
|
||||
)
|
||||
self._logger.debug(
|
||||
f"{_LOG_KIND_PREFIX}.progress",
|
||||
extra={
|
||||
"kind": f"{_LOG_KIND_PREFIX}.progress",
|
||||
"tiles_done": event.tiles_done,
|
||||
"tiles_total": event.tiles_total,
|
||||
"current_batch_size": event.current_batch_size,
|
||||
"elapsed_s": event.elapsed_s,
|
||||
},
|
||||
)
|
||||
if self._config.progress_callback is not None:
|
||||
self._config.progress_callback(event)
|
||||
@@ -12,6 +12,7 @@ from __future__ import annotations
|
||||
|
||||
__all__ = [
|
||||
"C10ProvisioningError",
|
||||
"DescriptorBatchError",
|
||||
"ManifestWriteError",
|
||||
]
|
||||
|
||||
@@ -20,6 +21,26 @@ class C10ProvisioningError(Exception):
|
||||
"""Base class for the C10 cache-provisioning error family."""
|
||||
|
||||
|
||||
class DescriptorBatchError(C10ProvisioningError):
|
||||
"""``DescriptorBatcher.populate_descriptors`` could not finish (AZ-322).
|
||||
|
||||
Surfaces three failure modes:
|
||||
|
||||
1. ``"CUDA OOM"`` raised by the injected
|
||||
:class:`gps_denied_onboard.components.c10_provisioning.interface.BackboneEmbedder`;
|
||||
the batcher catches the OOM-flavoured instance and triggers the
|
||||
halve-and-retry loop (AC-2). Persistent OOM after retries are
|
||||
exhausted re-raises with the final batch size + tile-id
|
||||
context (AC-3).
|
||||
2. ``descriptor_dim`` mismatch — the impl returned a column count
|
||||
that does not equal :meth:`BackboneEmbedder.descriptor_dim`
|
||||
(AC-9); raised BEFORE the FAISS rebuild call so an existing
|
||||
valid index is not corrupted.
|
||||
3. Underlying FAISS rebuild failure (rewrapped from the AZ-306
|
||||
:class:`IndexBuildError` envelope).
|
||||
"""
|
||||
|
||||
|
||||
class ManifestWriteError(C10ProvisioningError):
|
||||
"""``ManifestBuilder.build_manifest`` could not produce a signed Manifest.
|
||||
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
- :class:`CacheProvisioner` (AZ-325, pending) — pre-flight orchestrator.
|
||||
- :class:`ManifestSigner` (AZ-323) — Ed25519 detached signing surface
|
||||
consumed by :class:`ManifestBuilder`.
|
||||
- :class:`BackboneEmbedder` (AZ-322) — image-batch → descriptor surface
|
||||
consumed by :class:`DescriptorBatcher`. The default impl wraps the
|
||||
AZ-298 / AZ-299 / AZ-300 ``InferenceRuntime``-produced engine; when
|
||||
E-C2 (AZ-336+) ships its public embed surface a thin adapter swaps
|
||||
the impl in via the composition root.
|
||||
|
||||
Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
|
||||
`_docs/02_document/components/11_c10_provisioning/`.
|
||||
@@ -11,11 +16,15 @@ Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Protocol, runtime_checkable
|
||||
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
||||
|
||||
from gps_denied_onboard._types.manifests import Manifest
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
|
||||
__all__ = [
|
||||
"BackboneEmbedder",
|
||||
"CacheProvisioner",
|
||||
"ManifestSigner",
|
||||
"SigningKeyHandle",
|
||||
@@ -66,3 +75,28 @@ class ManifestSigner(Protocol):
|
||||
def sign(self, key: SigningKeyHandle, payload_bytes: bytes) -> bytes: ...
|
||||
|
||||
def public_key_fingerprint(self, key: SigningKeyHandle) -> str: ...
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class BackboneEmbedder(Protocol):
|
||||
"""Image-batch → descriptor matrix surface (AZ-322).
|
||||
|
||||
Two-method contract:
|
||||
|
||||
- :meth:`embed_batch` takes a list of mmap-backed tile pixel
|
||||
handles (any object exposing the c6 ``TilePixelHandle`` ABC) and
|
||||
returns an ``np.ndarray`` of shape ``(len(tiles),
|
||||
descriptor_dim())`` with ``dtype == float32``.
|
||||
- :meth:`descriptor_dim` returns the fixed descriptor dimension
|
||||
the impl produces; queried once before the first batch and used
|
||||
to validate every batch's last axis (AC-9).
|
||||
|
||||
On CUDA OOM the impl raises
|
||||
:class:`gps_denied_onboard.components.c10_provisioning.errors.DescriptorBatchError`
|
||||
with ``"CUDA OOM"`` in the message — the batcher catches this
|
||||
distinguishable subtype and triggers halve-and-retry (AC-2).
|
||||
"""
|
||||
|
||||
def embed_batch(self, tiles: list[Any]) -> np.ndarray: ...
|
||||
|
||||
def descriptor_dim(self) -> int: ...
|
||||
|
||||
Reference in New Issue
Block a user