mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 08:51:12 +00:00
[AZ-322] C10 DescriptorBatcher (faiss-cpu, OOM halve-retry)
Implements the C10 internal phase that walks every C6 tile, embeds through C2's backbone via the AZ-321-produced engine, and rebuilds the AZ-306 FAISS HNSW index in one atomic write. - DescriptorBatcher with halve-and-retry OOM recovery (default 1 retry) - BackboneEmbedder Protocol + C7EngineBackboneEmbedder default impl - DescriptorBatchError for OOM / dim-mismatch / missing-output failures - Empty-corpus surfaces as outcome=failure with explicit hint to run C11 - Per-10% progress callback + DEBUG logs (no engine bytes leaked) - Consumer-side Protocol cuts (TilesByBboxBatchQuery, TilePixelOpener, DescriptorIndexRebuilder) so c10 stays within AZ-270 lint - runtime_root.c10_factory adds build_descriptor_batcher + three C6->C10 adapters - 16 unit tests covering AC-1..AC-10 + 2 NFRs + 4 supplemental (Protocol conformance, query pass-through, handle release, config) Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -100,6 +100,24 @@ C10 reads `tiles` rows from C6 (scoped to the build's bbox + zoom_levels), write
|
|||||||
| atomicwrites | latest | Atomic file replacement for `.index` + Manifest (D-C10-3) |
|
| atomicwrites | latest | Atomic file replacement for `.index` + Manifest (D-C10-3) |
|
||||||
| hashlib (stdlib) | stdlib | SHA-256 content-hash sidecars |
|
| hashlib (stdlib) | stdlib | SHA-256 content-hash sidecars |
|
||||||
| PyYAML / orjson | per project pin | Manifest serialization |
|
| PyYAML / orjson | per project pin | Manifest serialization |
|
||||||
|
| numpy | per project pin | Descriptor batch ndarray container (AZ-322 `DescriptorBatcher`) |
|
||||||
|
|
||||||
|
**AZ-322 internal phase — `DescriptorBatcher`**:
|
||||||
|
|
||||||
|
The `populate_descriptors` phase walks every tile in C6 for the requested
|
||||||
|
`(bbox, zoom_levels, sector_class)`, embeds them through C7's `InferenceRuntime`
|
||||||
|
(via `C7EngineBackboneEmbedder`, the default `BackboneEmbedder` impl), and
|
||||||
|
hands the resulting `(N, descriptor_dim)` ndarray to AZ-306's
|
||||||
|
`DescriptorIndex.rebuild_from_descriptors` for atomic FAISS index write.
|
||||||
|
CUDA OOM is handled via halve-and-retry bounded by `C10BatcherConfig.max_oom_retries`
|
||||||
|
(default 1: 64 → 32, then succeed-or-fail-fast) so a real GPU regression
|
||||||
|
surfaces in seconds rather than via silent retries. Per-10% progress is
|
||||||
|
emitted both as DEBUG logs (`c10.descriptor.progress`) and via an optional
|
||||||
|
`progress_callback` so operator tooling can wire a TTY/GUI bar without
|
||||||
|
touching the batcher itself. The descriptor int64 id formula is the
|
||||||
|
canonical AZ-306 scheme (`int.from_bytes(sha256("zoom|lat|lon").first8, "big", signed=True)`)
|
||||||
|
— invented locally to avoid a circular dependency back into C6 internals
|
||||||
|
would break AC-6.
|
||||||
|
|
||||||
**Error Handling Strategy**:
|
**Error Handling Strategy**:
|
||||||
|
|
||||||
|
|||||||
@@ -209,14 +209,16 @@ Bootstrap reference: `_docs/02_tasks/todo/AZ-263_initial_structure.md`. Architec
|
|||||||
- **Epic**: AZ-252 (E-C10 Cache Provisioner)
|
- **Epic**: AZ-252 (E-C10 Cache Provisioner)
|
||||||
- **Directory**: `src/gps_denied_onboard/components/c10_provisioning/`
|
- **Directory**: `src/gps_denied_onboard/components/c10_provisioning/`
|
||||||
- **Public API**:
|
- **Public API**:
|
||||||
- `__init__.py` (re-exports `CacheProvisioner`, `Manifest`, `EngineCacheEntry`, plus AZ-321 surface: `EngineCompiler`, `BackboneSpec`, `EngineCompileRequest`, `EngineCompileResult`, `CompileOutcome`, `EngineCompileSummary`, `CompileEngineCallable`, `BackboneConfig`, `C10ProvisioningConfig`)
|
- `__init__.py` (re-exports `CacheProvisioner`, `Manifest`, `EngineCacheEntry`, plus AZ-321 surface: `EngineCompiler`, `BackboneSpec`, `EngineCompileRequest`, `EngineCompileResult`, `CompileOutcome`, `EngineCompileSummary`, `CompileEngineCallable`, `BackboneConfig`, `C10ProvisioningConfig`, plus AZ-322 surface: `DescriptorBatcher`, `BackboneEmbedder`, `C7EngineBackboneEmbedder`, `C10BatcherConfig`, `CorpusFilter`, `DescriptorBatchReport`, `ProgressEvent`, `TileBboxRecord`, `BatcherTile`, `TilesByBboxBatchQuery`, `TilePixelOpener`, `DescriptorIndexRebuilder`, `DescriptorBatchError`)
|
||||||
- `interface.py` (`CacheProvisioner` Protocol)
|
- `interface.py` (`CacheProvisioner` Protocol, `BackboneEmbedder` Protocol — AZ-322)
|
||||||
- Config block: `C10ProvisioningConfig` (registered on import)
|
- Config block: `C10ProvisioningConfig` (registered on import)
|
||||||
- **Internal**:
|
- **Internal**:
|
||||||
- `engine_compiler.py` (AZ-321; per-model TRT compile + hardware-tied cache reuse + `CompileEngineCallable` structural cut of the C7 InferenceRuntime)
|
- `engine_compiler.py` (AZ-321; per-model TRT compile + hardware-tied cache reuse + `CompileEngineCallable` structural cut of the C7 InferenceRuntime)
|
||||||
- `config.py` (AZ-321; `BackboneConfig` + `C10ProvisioningConfig` dataclasses)
|
- `config.py` (AZ-321; `BackboneConfig` + `C10ProvisioningConfig` dataclasses)
|
||||||
|
- `descriptor_batcher.py` (AZ-322; `DescriptorBatcher` + DTOs + consumer-side Protocols `TilesByBboxBatchQuery` / `TilePixelOpener` / `DescriptorIndexRebuilder`)
|
||||||
|
- `c7_engine_embedder.py` (AZ-322; `C7EngineBackboneEmbedder` adapter wrapping AZ-297 `InferenceRuntime` + AZ-321 engine path)
|
||||||
- `default_provisioner.py` (engine compile + descriptors + manifest + content-hash gate, pending)
|
- `default_provisioner.py` (engine compile + descriptors + manifest + content-hash gate, pending)
|
||||||
- Composition root: `runtime_root/c10_factory.py` (`build_engine_compiler`, `build_backbone_specs`)
|
- Composition root: `runtime_root/c10_factory.py` (`build_engine_compiler`, `build_backbone_specs`, `build_manifest_builder`, `build_manifest_verifier`, `build_descriptor_batcher` + the C6→C10 adapters `c6_tile_metadata_store_to_tiles_batch_query`, `c6_tile_store_to_pixel_opener`, `c6_descriptor_index_to_rebuilder`)
|
||||||
- **Owns**: `src/gps_denied_onboard/components/c10_provisioning/**`, `tests/unit/c10_provisioning/**`
|
- **Owns**: `src/gps_denied_onboard/components/c10_provisioning/**`, `tests/unit/c10_provisioning/**`
|
||||||
- **Imports from**: `_types` (cross-component DTOs `EngineCacheEntry`, `BuildConfig`, `PrecisionMode`, `OptimizationProfile`, `HostCapabilities`, `TileMetadata`, etc.), `_types.inference_errors` (AZ-507 typed-error envelope for `EngineBuildError` + `CalibrationCacheError`), `helpers.sha256_sidecar`, `helpers.engine_filename_schema`, `helpers.wgs_converter`, `config`, `logging`, `fdr_client`. The `InferenceRuntime.compile_engine` surface (c7) and the `TileMetadataStore.query_by_bbox` surface (c6) are obtained via constructor-injected consumer-side structural Protocol cuts (the `CompileEngineCallable` cut already lives in `engine_compiler.py`; AZ-323 / AZ-324 will define analogous `query_by_bbox` cuts inside `c10_provisioning/`). NEVER `from gps_denied_onboard.components.c6_tile_cache import ...` or `from gps_denied_onboard.components.c7_inference import ...` inside `c10_provisioning/*.py`.
|
- **Imports from**: `_types` (cross-component DTOs `EngineCacheEntry`, `BuildConfig`, `PrecisionMode`, `OptimizationProfile`, `HostCapabilities`, `TileMetadata`, etc.), `_types.inference_errors` (AZ-507 typed-error envelope for `EngineBuildError` + `CalibrationCacheError`), `helpers.sha256_sidecar`, `helpers.engine_filename_schema`, `helpers.wgs_converter`, `config`, `logging`, `fdr_client`. The `InferenceRuntime.compile_engine` surface (c7) and the `TileMetadataStore.query_by_bbox` surface (c6) are obtained via constructor-injected consumer-side structural Protocol cuts (the `CompileEngineCallable` cut already lives in `engine_compiler.py`; AZ-323 / AZ-324 will define analogous `query_by_bbox` cuts inside `c10_provisioning/`). NEVER `from gps_denied_onboard.components.c6_tile_cache import ...` or `from gps_denied_onboard.components.c7_inference import ...` inside `c10_provisioning/*.py`.
|
||||||
- **Consumed by**: `c12_operator_tooling`, `runtime_root` (operator binary only — excluded from airborne via `BUILD_C10_PROVISIONING=OFF` for airborne build per ADR-002)
|
- **Consumed by**: `c12_operator_tooling`, `runtime_root` (operator binary only — excluded from airborne via `BUILD_C10_PROVISIONING=OFF` for airborne build per ADR-002)
|
||||||
|
|||||||
@@ -0,0 +1,141 @@
|
|||||||
|
# Batch 36 — Cycle 1 Report
|
||||||
|
|
||||||
|
**Date**: 2026-05-13
|
||||||
|
**Batch**: 36 (single task — direct AZ-306 follow-up)
|
||||||
|
**Tasks**: AZ-322 (C10 Descriptor Batcher, 3pt)
|
||||||
|
**Status**: complete; AZ-322 transitioned to "In Testing" pending operator review.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
AZ-322 implements `DescriptorBatcher` — the C10 phase that walks every C6 tile in the requested
|
||||||
|
`(bbox, zoom_levels, sector_class)`, embeds it through C2's VPR backbone (via the C7 engine produced
|
||||||
|
by AZ-321), and rebuilds the AZ-306 FAISS HNSW index in one atomic write.
|
||||||
|
|
||||||
|
This unblocks the airborne C2 VPR step's takeoff verify (AC-NEW-1) and makes the C10-PT-01
|
||||||
|
cold-build budget observable end-to-end.
|
||||||
|
|
||||||
|
## Architectural Decisions
|
||||||
|
|
||||||
|
### 1. Consumer-side Protocol cuts (AZ-270 / AZ-507 compliance)
|
||||||
|
|
||||||
|
The AZ-322 task spec listed direct C6 types (`TileMetadataStore`, `TileStore`, `DescriptorIndex`)
|
||||||
|
in the `DescriptorBatcher.__init__` signature. That contradicts AZ-270 (no cross-component
|
||||||
|
imports inside `components/*`) and the AZ-507 cross-component contract surface rule. The
|
||||||
|
established precedent — AZ-323's `ManifestBuilder` and AZ-324's `ManifestVerifierImpl` — declares
|
||||||
|
**consumer-side structural Protocol cuts** locally inside the C10 module and lets the composition
|
||||||
|
root (`runtime_root.c10_factory`) wire C6's concrete strategies in via thin adapters.
|
||||||
|
|
||||||
|
This batch follows that precedent. `descriptor_batcher.py` declares four
|
||||||
|
local-to-C10 Protocols:
|
||||||
|
|
||||||
|
- `BackboneEmbedder` (lifted to `interface.py` for re-use by future tasks)
|
||||||
|
- `TilesByBboxBatchQuery` — narrower than C6's `TileMetadataStore.query_by_bbox`, accepts
|
||||||
|
`tuple[int, ...]` of zooms instead of a single zoom
|
||||||
|
- `TilePixelOpener` — narrower than C6's `TileStore.read_tile_pixels(TileId)`; takes
|
||||||
|
`(zoom, lat, lon)` and returns a context manager
|
||||||
|
- `DescriptorIndexRebuilder` — narrower than C6's
|
||||||
|
`DescriptorIndex.rebuild_from_descriptors(descriptors, tile_ids: list[TileId], hnsw_params: HnswParams)`;
|
||||||
|
takes `tile_records: list[TileBboxRecord]` plus individual HNSW kwargs
|
||||||
|
|
||||||
|
The matching adapters live in `runtime_root/c10_factory.py`:
|
||||||
|
|
||||||
|
- `c6_tile_metadata_store_to_tiles_batch_query` — loops over `zoom_levels`, projects `TileMetadata`
|
||||||
|
rows down to the four-field `TileBboxRecord`
|
||||||
|
- `c6_tile_store_to_pixel_opener` — builds `TileId` and returns the C6 `TilePixelHandle` (already
|
||||||
|
a context manager)
|
||||||
|
- `c6_descriptor_index_to_rebuilder` — projects `TileBboxRecord` → `TileId` and folds HNSW kwargs
|
||||||
|
into `HnswParams`
|
||||||
|
|
||||||
|
### 2. `C7EngineBackboneEmbedder` adapter — `Any`-typed at the c7 boundary
|
||||||
|
|
||||||
|
The default `BackboneEmbedder` impl wraps an AZ-297 `InferenceRuntime` + an AZ-321-compiled
|
||||||
|
`EngineHandle`. Importing those types — even under `TYPE_CHECKING` — fails the AZ-270 AST lint
|
||||||
|
because the lint walks `ast.ImportFrom` nodes regardless of context. We therefore type the
|
||||||
|
constructor parameters as `Any` and rely on structural duck-typing
|
||||||
|
(`inference_runtime.infer(handle, dict) -> dict`). The composition root wires the concrete C7
|
||||||
|
runtime in.
|
||||||
|
|
||||||
|
### 3. JPEG → tensor preprocessing is injected, not owned
|
||||||
|
|
||||||
|
`C7EngineBackboneEmbedder` accepts a `tile_decoder: Callable[[Any], np.ndarray]` rather than
|
||||||
|
hard-wiring OpenCV / Pillow / torchvision. Image preprocessing belongs to E-C2 (AZ-255); when
|
||||||
|
it ships, the composition root injects a real decoder. Until then the adapter stays free of
|
||||||
|
imaging-stack dependencies, keeping AZ-322's surface narrow and the test surface tiny.
|
||||||
|
|
||||||
|
### 4. Descriptor int64 id formula — reuse AZ-306, do not invent
|
||||||
|
|
||||||
|
`DescriptorBatcher` does NOT recompute the int64 id formula. It hands `TileBboxRecord` rows to
|
||||||
|
the rebuilder; the rebuilder adapter projects to `TileId`; AZ-306's
|
||||||
|
`FaissDescriptorIndex.rebuild_from_descriptors` uses the canonical
|
||||||
|
`tile_id_to_int64(TileId)` helper. Test `test_ac6_descriptor_id_mapping_matches_az306_scheme`
|
||||||
|
confirms by importing `tile_id_to_int64` directly and asserting against the
|
||||||
|
`int.from_bytes(sha256("zoom|lat|lon").first8, "big", signed=True)` formula.
|
||||||
|
|
||||||
|
## Files Changed
|
||||||
|
|
||||||
|
### Production code (new)
|
||||||
|
|
||||||
|
- `src/gps_denied_onboard/components/c10_provisioning/descriptor_batcher.py` — `DescriptorBatcher`
|
||||||
|
class + `BatcherTile`, `TileBboxRecord`, `CorpusFilter`, `ProgressEvent`, `DescriptorBatchReport`,
|
||||||
|
`BatcherOutcome`, `C10BatcherConfig` DTOs + `TilesByBboxBatchQuery`, `TilePixelOpener`,
|
||||||
|
`DescriptorIndexRebuilder` consumer Protocols.
|
||||||
|
- `src/gps_denied_onboard/components/c10_provisioning/c7_engine_embedder.py` —
|
||||||
|
`C7EngineBackboneEmbedder` adapter wrapping the AZ-297 `InferenceRuntime` surface; `Any`-typed
|
||||||
|
to stay below the AZ-270 boundary.
|
||||||
|
|
||||||
|
### Production code (modified)
|
||||||
|
|
||||||
|
- `src/gps_denied_onboard/components/c10_provisioning/interface.py` — added `BackboneEmbedder`
|
||||||
|
Protocol (`embed_batch` + `descriptor_dim`), `runtime_checkable`.
|
||||||
|
- `src/gps_denied_onboard/components/c10_provisioning/errors.py` — added `DescriptorBatchError`
|
||||||
|
exception class extending `C10ProvisioningError`.
|
||||||
|
- `src/gps_denied_onboard/components/c10_provisioning/__init__.py` — re-exported all new symbols.
|
||||||
|
- `src/gps_denied_onboard/runtime_root/c10_factory.py` — added `build_descriptor_batcher` plus
|
||||||
|
the three C6→C10 adapter functions.
|
||||||
|
|
||||||
|
### Tests (new)
|
||||||
|
|
||||||
|
- `tests/unit/c10_provisioning/test_descriptor_batcher.py` — 16 tests covering AC-1 through
|
||||||
|
AC-10 + NFR-perf-overhead + NFR-reliability-bounded-retry, plus 4 supplemental tests
|
||||||
|
(`Protocol` runtime-check for the four consumer cuts, query-args pass-through, handle
|
||||||
|
release on embed failure, config validation).
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
- `_docs/02_document/module-layout.md` — c10 Public API + Internal section updated to list the
|
||||||
|
AZ-322 surface; composition root section lists the new factory + adapters.
|
||||||
|
- `_docs/02_document/components/11_c10_provisioning/description.md` — §5 dependency table picks
|
||||||
|
up `numpy`; new "AZ-322 internal phase" subsection summarises the batcher's
|
||||||
|
contract / OOM behaviour / progress reporting / id formula.
|
||||||
|
|
||||||
|
## Test Results
|
||||||
|
|
||||||
|
- 16 / 16 AZ-322 tests pass (`tests/unit/c10_provisioning/test_descriptor_batcher.py`).
|
||||||
|
- 197 / 197 c10 + c6 + runtime-root targeted runs pass (59 docker-skip).
|
||||||
|
- Full project suite: **1352 passed, 79 skipped, 1 failed**.
|
||||||
|
- 79 skipped: docker / Jetson / CUDA / actionlint env-gated (Tier-0 dev host).
|
||||||
|
- 1 failed: `tests/unit/test_ac1_scaffold_layout.py::test_cmake_files_configure` —
|
||||||
|
pre-existing OKVIS2 git-submodule failure documented in batch_35 cycle report; unrelated
|
||||||
|
to this batch.
|
||||||
|
|
||||||
|
## Decisions Ledger
|
||||||
|
|
||||||
|
| Decision | Rationale |
|
||||||
|
|----------|-----------|
|
||||||
|
| `DescriptorBatcher.__init__` takes consumer-side Protocols, not raw C6 types | AZ-270 lint blocks direct cross-component imports; AZ-323 / AZ-324 set the precedent |
|
||||||
|
| `C7EngineBackboneEmbedder` parameters are `Any`-typed | AZ-270 AST lint flags `TYPE_CHECKING` imports too; structural duck-typing avoids the boundary |
|
||||||
|
| `tile_decoder` is injected, not bundled | JPEG preprocessing belongs to E-C2 (AZ-255); keeping it out of AZ-322 narrows scope and dependencies |
|
||||||
|
| Default `C10BatcherConfig.max_oom_retries=1` | Spec NFR-reliability-bounded-retry; one halve from 64 → 32 is the standard surface, deeper retries mask GPU regressions |
|
||||||
|
| Reuse AZ-306's `tile_id_to_int64` | Spec AC-6; inventing the formula here would diverge from C6's id scheme |
|
||||||
|
| Atomic FAISS rebuild guaranteed by AZ-306, not duplicated here | Spec AC-7; the batcher's role is to call `rebuild_from_descriptors` exactly once |
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The `C7EngineBackboneEmbedder` is the default `BackboneEmbedder` impl, but production wiring
|
||||||
|
to a real C7 engine awaits AZ-326 (T5 orchestrator) and AZ-255 (real C2 backbone preprocessing).
|
||||||
|
The adapter is unit-tested via fakes today; integration tests land with AZ-326.
|
||||||
|
- `C10BatcherConfig` currently has no dedicated config-block hook in
|
||||||
|
`C10ProvisioningConfig`; `build_descriptor_batcher` uses defaults. AZ-326 will add the
|
||||||
|
config-block plumbing.
|
||||||
|
- The OKVIS2 cmake submodule failure remains and is independent of every batch-35 / batch-36
|
||||||
|
change. It will resolve when the project's submodules are initialised on the dev host.
|
||||||
@@ -6,11 +6,11 @@ step: 7
|
|||||||
name: Implement
|
name: Implement
|
||||||
status: in_progress
|
status: in_progress
|
||||||
sub_step:
|
sub_step:
|
||||||
phase: 3
|
phase: 4
|
||||||
name: compute-next-batch
|
name: batch-complete
|
||||||
detail: "batch 35 complete (AZ-306 5pt; faiss-cpu PyPI strategy chosen over custom pybind11 wrapper); awaiting next batch selection"
|
detail: "batch 36 complete: AZ-322 implemented + tests + factory wiring; ready to chain to next batch"
|
||||||
retry_count: 0
|
retry_count: 0
|
||||||
cycle: 1
|
cycle: 1
|
||||||
tracker: jira
|
tracker: jira
|
||||||
last_completed_batch: 35
|
last_completed_batch: 36
|
||||||
last_cumulative_review: batches_31-33
|
last_cumulative_review: batches_31-33
|
||||||
|
|||||||
@@ -11,12 +11,27 @@ them through this single contract surface.
|
|||||||
|
|
||||||
from gps_denied_onboard._types.inference import EngineCacheEntry
|
from gps_denied_onboard._types.inference import EngineCacheEntry
|
||||||
from gps_denied_onboard._types.manifests import Manifest
|
from gps_denied_onboard._types.manifests import Manifest
|
||||||
|
from gps_denied_onboard.components.c10_provisioning.c7_engine_embedder import (
|
||||||
|
C7EngineBackboneEmbedder,
|
||||||
|
)
|
||||||
from gps_denied_onboard.components.c10_provisioning.config import (
|
from gps_denied_onboard.components.c10_provisioning.config import (
|
||||||
BackboneConfig,
|
BackboneConfig,
|
||||||
C10ManifestConfig,
|
C10ManifestConfig,
|
||||||
C10ProvisioningConfig,
|
C10ProvisioningConfig,
|
||||||
SigningMode,
|
SigningMode,
|
||||||
)
|
)
|
||||||
|
from gps_denied_onboard.components.c10_provisioning.descriptor_batcher import (
|
||||||
|
BatcherTile,
|
||||||
|
C10BatcherConfig,
|
||||||
|
CorpusFilter,
|
||||||
|
DescriptorBatcher,
|
||||||
|
DescriptorBatchReport,
|
||||||
|
DescriptorIndexRebuilder,
|
||||||
|
ProgressEvent,
|
||||||
|
TileBboxRecord,
|
||||||
|
TilePixelOpener,
|
||||||
|
TilesByBboxBatchQuery,
|
||||||
|
)
|
||||||
from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
|
from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
|
||||||
BackboneSpec,
|
BackboneSpec,
|
||||||
CompileEngineCallable,
|
CompileEngineCallable,
|
||||||
@@ -28,9 +43,11 @@ from gps_denied_onboard.components.c10_provisioning.engine_compiler import (
|
|||||||
)
|
)
|
||||||
from gps_denied_onboard.components.c10_provisioning.errors import (
|
from gps_denied_onboard.components.c10_provisioning.errors import (
|
||||||
C10ProvisioningError,
|
C10ProvisioningError,
|
||||||
|
DescriptorBatchError,
|
||||||
ManifestWriteError,
|
ManifestWriteError,
|
||||||
)
|
)
|
||||||
from gps_denied_onboard.components.c10_provisioning.interface import (
|
from gps_denied_onboard.components.c10_provisioning.interface import (
|
||||||
|
BackboneEmbedder,
|
||||||
CacheProvisioner,
|
CacheProvisioner,
|
||||||
ManifestSigner,
|
ManifestSigner,
|
||||||
SigningKeyHandle,
|
SigningKeyHandle,
|
||||||
@@ -60,13 +77,22 @@ __all__ = [
|
|||||||
"VALID_SECTOR_CLASSES",
|
"VALID_SECTOR_CLASSES",
|
||||||
"ArtifactCheck",
|
"ArtifactCheck",
|
||||||
"BackboneConfig",
|
"BackboneConfig",
|
||||||
|
"BackboneEmbedder",
|
||||||
"BackboneSpec",
|
"BackboneSpec",
|
||||||
|
"BatcherTile",
|
||||||
|
"C7EngineBackboneEmbedder",
|
||||||
|
"C10BatcherConfig",
|
||||||
"C10ManifestConfig",
|
"C10ManifestConfig",
|
||||||
"C10ProvisioningConfig",
|
"C10ProvisioningConfig",
|
||||||
"C10ProvisioningError",
|
"C10ProvisioningError",
|
||||||
"CacheProvisioner",
|
"CacheProvisioner",
|
||||||
"CompileEngineCallable",
|
"CompileEngineCallable",
|
||||||
"CompileOutcome",
|
"CompileOutcome",
|
||||||
|
"CorpusFilter",
|
||||||
|
"DescriptorBatchError",
|
||||||
|
"DescriptorBatchReport",
|
||||||
|
"DescriptorBatcher",
|
||||||
|
"DescriptorIndexRebuilder",
|
||||||
"Ed25519ManifestSigner",
|
"Ed25519ManifestSigner",
|
||||||
"EngineCacheEntry",
|
"EngineCacheEntry",
|
||||||
"EngineCompileRequest",
|
"EngineCompileRequest",
|
||||||
@@ -81,9 +107,13 @@ __all__ = [
|
|||||||
"ManifestVerifier",
|
"ManifestVerifier",
|
||||||
"ManifestVerifierImpl",
|
"ManifestVerifierImpl",
|
||||||
"ManifestWriteError",
|
"ManifestWriteError",
|
||||||
|
"ProgressEvent",
|
||||||
"SigningKeyHandle",
|
"SigningKeyHandle",
|
||||||
"SigningMode",
|
"SigningMode",
|
||||||
|
"TileBboxRecord",
|
||||||
"TileHashRecord",
|
"TileHashRecord",
|
||||||
|
"TilePixelOpener",
|
||||||
|
"TilesByBboxBatchQuery",
|
||||||
"TilesByBboxQuery",
|
"TilesByBboxQuery",
|
||||||
"VerificationResult",
|
"VerificationResult",
|
||||||
"VerifyFailReason",
|
"VerifyFailReason",
|
||||||
|
|||||||
@@ -0,0 +1,150 @@
|
|||||||
|
"""``C7EngineBackboneEmbedder`` (AZ-322).
|
||||||
|
|
||||||
|
Default :class:`BackboneEmbedder` implementation: wraps an AZ-321-produced
|
||||||
|
engine + an AZ-297 :class:`InferenceRuntime` and turns
|
||||||
|
``list[TilePixelHandle]`` into ``np.ndarray`` of shape
|
||||||
|
``(batch_size, descriptor_dim)``.
|
||||||
|
|
||||||
|
JPEG → tensor preprocessing is **not** owned here — it is the
|
||||||
|
:class:`BackboneSpec` consumer's responsibility (the C2 VPR backbone in
|
||||||
|
AZ-255 will eventually own its own normalization). Until E-C2 ships,
|
||||||
|
the composition root injects a ``tile_decoder`` callable so this
|
||||||
|
adapter stays free of OpenCV / Pillow / torchvision imports and the
|
||||||
|
test surface stays narrow. Risk-1 in the AZ-322 spec mitigation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from collections.abc import Callable
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gps_denied_onboard.components.c10_provisioning.errors import (
|
||||||
|
DescriptorBatchError,
|
||||||
|
)
|
||||||
|
|
||||||
|
# AZ-322: ``InferenceRuntime`` (c7) and ``EngineHandle`` (_types) are
|
||||||
|
# REFERENCED only in annotations. Importing them at runtime — even
|
||||||
|
# under ``TYPE_CHECKING`` — would cross the AZ-270 component boundary
|
||||||
|
# (the AST lint flags TYPE_CHECKING imports too, conservatively). We
|
||||||
|
# instead duck-type these via ``Any`` and rely on structural calls
|
||||||
|
# (``inference_runtime.infer(handle, dict) -> dict``); the
|
||||||
|
# composition root (``runtime_root.c10_factory``) wires the concrete
|
||||||
|
# c7 instance in.
|
||||||
|
|
||||||
|
__all__ = ["C7EngineBackboneEmbedder"]
|
||||||
|
|
||||||
|
|
||||||
|
_OOM_MARKERS = ("CUDA out of memory", "OutOfMemoryError", "OOM")
|
||||||
|
|
||||||
|
|
||||||
|
class C7EngineBackboneEmbedder:
|
||||||
|
"""Thin adapter from AZ-297's :class:`InferenceRuntime` to
|
||||||
|
:class:`BackboneEmbedder`.
|
||||||
|
|
||||||
|
Construction owns one :class:`EngineHandle` for the lifetime of
|
||||||
|
the embedder (one batcher session). ``embed_batch`` decodes the
|
||||||
|
incoming tile handles via the injected ``tile_decoder`` callable,
|
||||||
|
stacks them into a batch tensor, and runs ``infer`` once.
|
||||||
|
|
||||||
|
The output tensor is read from ``outputs[output_name]`` —
|
||||||
|
exposing ``output_name`` keeps the adapter portable across
|
||||||
|
backbones (DINOv2-VPR uses ``"descriptor"``, others may differ).
|
||||||
|
|
||||||
|
OOM rewrap: any :class:`gps_denied_onboard.components.c7_inference.errors.OutOfMemoryError`
|
||||||
|
OR an exception whose ``str`` contains an OOM marker is rewrapped
|
||||||
|
as :class:`DescriptorBatchError("CUDA OOM at batch_size=N")` so
|
||||||
|
:class:`DescriptorBatcher`'s halve-and-retry catches it (AC-2).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
inference_runtime: Any,
|
||||||
|
engine_handle: Any,
|
||||||
|
input_name: str,
|
||||||
|
output_name: str,
|
||||||
|
descriptor_dim: int,
|
||||||
|
tile_decoder: Callable[[Any], np.ndarray],
|
||||||
|
logger: logging.Logger,
|
||||||
|
) -> None:
|
||||||
|
if descriptor_dim <= 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"descriptor_dim must be positive; got {descriptor_dim}"
|
||||||
|
)
|
||||||
|
self._runtime = inference_runtime
|
||||||
|
self._handle = engine_handle
|
||||||
|
self._input_name = input_name
|
||||||
|
self._output_name = output_name
|
||||||
|
self._descriptor_dim = descriptor_dim
|
||||||
|
self._tile_decoder = tile_decoder
|
||||||
|
self._logger = logger
|
||||||
|
|
||||||
|
def embed_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||||
|
if not tiles:
|
||||||
|
return np.empty((0, self._descriptor_dim), dtype=np.float32)
|
||||||
|
|
||||||
|
batch = self._stack_batch(tiles)
|
||||||
|
try:
|
||||||
|
outputs = self._runtime.infer(
|
||||||
|
self._handle, {self._input_name: batch}
|
||||||
|
)
|
||||||
|
except Exception as exc: # rewrap OOM; surface everything else
|
||||||
|
if _looks_like_oom(exc):
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
f"CUDA OOM at batch_size={len(tiles)}"
|
||||||
|
) from exc
|
||||||
|
raise
|
||||||
|
|
||||||
|
if self._output_name not in outputs:
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
f"engine output dict missing key {self._output_name!r}; "
|
||||||
|
f"available keys = {list(outputs.keys())}"
|
||||||
|
)
|
||||||
|
descriptors = outputs[self._output_name]
|
||||||
|
if not isinstance(descriptors, np.ndarray):
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
f"engine output {self._output_name!r} is not an ndarray; "
|
||||||
|
f"got {type(descriptors).__name__}"
|
||||||
|
)
|
||||||
|
if descriptors.ndim != 2 or descriptors.shape[0] != len(tiles):
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
f"engine output shape {descriptors.shape} does not match "
|
||||||
|
f"expected (batch={len(tiles)}, dim={self._descriptor_dim})"
|
||||||
|
)
|
||||||
|
if descriptors.dtype != np.float32:
|
||||||
|
descriptors = descriptors.astype(np.float32, copy=False)
|
||||||
|
return descriptors
|
||||||
|
|
||||||
|
def descriptor_dim(self) -> int:
|
||||||
|
return self._descriptor_dim
|
||||||
|
|
||||||
|
def _stack_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||||
|
decoded = [self._tile_decoder(handle) for handle in tiles]
|
||||||
|
if not decoded:
|
||||||
|
return np.empty((0,), dtype=np.float32)
|
||||||
|
first_shape = decoded[0].shape
|
||||||
|
for i, arr in enumerate(decoded[1:], start=1):
|
||||||
|
if arr.shape != first_shape:
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
f"tile_decoder returned shape mismatch at index {i}: "
|
||||||
|
f"{arr.shape} vs first {first_shape}"
|
||||||
|
)
|
||||||
|
return np.stack(decoded, axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_oom(exc: BaseException) -> bool:
|
||||||
|
"""Detect OOM by exception type name OR message marker.
|
||||||
|
|
||||||
|
The C7 contract names the canonical exception
|
||||||
|
:class:`OutOfMemoryError`; back-end SDKs occasionally raise raw
|
||||||
|
:class:`RuntimeError` with a message describing OOM (PyTorch
|
||||||
|
historically does this). We accept both so the AC-2 retry loop
|
||||||
|
catches the failure regardless of the underlying SDK.
|
||||||
|
"""
|
||||||
|
if type(exc).__name__ == "OutOfMemoryError":
|
||||||
|
return True
|
||||||
|
message = str(exc)
|
||||||
|
return any(marker in message for marker in _OOM_MARKERS)
|
||||||
@@ -0,0 +1,522 @@
|
|||||||
|
"""C10 ``DescriptorBatcher`` — embed corpus + rebuild FAISS index (AZ-322).
|
||||||
|
|
||||||
|
The pre-flight phase that walks every C6 tile in
|
||||||
|
``(bbox, zoom_levels)``, runs them through the C2 backbone (via the
|
||||||
|
AZ-321-produced engine) in batches sized for the operator workstation,
|
||||||
|
and rebuilds the FAISS HNSW index via AZ-303 / AZ-306's
|
||||||
|
:meth:`DescriptorIndex.rebuild_from_descriptors`.
|
||||||
|
|
||||||
|
Cross-component DTOs travel through consumer-side structural Protocol
|
||||||
|
cuts living in this module — :class:`TilesByBboxBatchQuery`,
|
||||||
|
:class:`TilePixelOpener`, :class:`DescriptorIndexRebuilder`,
|
||||||
|
:class:`BatcherTile` — so the AZ-270 lint
|
||||||
|
(``test_az270_compose_root.test_ac6``) stays green: this module never
|
||||||
|
imports ``components.c6_tile_cache`` directly. The composition root
|
||||||
|
adapts the real C6 surface inside
|
||||||
|
``runtime_root.c10_factory.build_descriptor_batcher``.
|
||||||
|
|
||||||
|
Design constraints baked in by the spec:
|
||||||
|
|
||||||
|
- Halve-and-retry on CUDA OOM is bounded by ``max_oom_retries``; default
|
||||||
|
1 (so 64 → 32 → fail-fast).
|
||||||
|
- ``rebuild_from_descriptors`` is the ONLY write path to the ``.index``
|
||||||
|
file (no raw ``numpy.tofile`` — AZ-306 owns atomicity).
|
||||||
|
- The int64 id formula is canonical via AZ-306 (the C6 helper is
|
||||||
|
re-imported through the composition-root adapter so this module never
|
||||||
|
reaches across the AZ-270 boundary).
|
||||||
|
- ``embed_batch`` is called with mmap-backed handles, not raw bytes —
|
||||||
|
preserves AZ-303's read-only invariant on tile pixels.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from collections.abc import Callable
|
||||||
|
from contextlib import ExitStack
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Protocol, runtime_checkable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from gps_denied_onboard.clock import Clock
|
||||||
|
from gps_denied_onboard.components.c10_provisioning.errors import (
|
||||||
|
DescriptorBatchError,
|
||||||
|
)
|
||||||
|
from gps_denied_onboard.components.c10_provisioning.interface import (
|
||||||
|
BackboneEmbedder,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"BatcherTile",
|
||||||
|
"C10BatcherConfig",
|
||||||
|
"CorpusFilter",
|
||||||
|
"DescriptorBatchReport",
|
||||||
|
"DescriptorBatcher",
|
||||||
|
"DescriptorIndexRebuilder",
|
||||||
|
"ProgressEvent",
|
||||||
|
"TileBboxRecord",
|
||||||
|
"TilePixelOpener",
|
||||||
|
"TilesByBboxBatchQuery",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
_LOG_KIND_PREFIX = "c10.descriptor"
|
||||||
|
_PROGRESS_LOG_FRACTION = 10 # emit one DEBUG log per ~10% of tiles
|
||||||
|
_PROGRESS_CALLBACK_FRACTION = 10 # callback fires every 10%
|
||||||
|
|
||||||
|
|
||||||
|
class BatcherOutcome(str, Enum):
|
||||||
|
"""Terminal classification of a :class:`DescriptorBatcher` run."""
|
||||||
|
|
||||||
|
SUCCESS = "success"
|
||||||
|
FAILURE = "failure"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class TileBboxRecord:
|
||||||
|
"""Consumer-side DTO returned by :class:`TilesByBboxBatchQuery`.
|
||||||
|
|
||||||
|
The composition-root adapter walks C6's ``TileMetadata`` rows and
|
||||||
|
emits one of these per tile so this module never imports the C6
|
||||||
|
DTO directly.
|
||||||
|
"""
|
||||||
|
|
||||||
|
zoom: int
|
||||||
|
lat: float
|
||||||
|
lon: float
|
||||||
|
source: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class CorpusFilter:
|
||||||
|
"""Filter set passed into :meth:`DescriptorBatcher.populate_descriptors`.
|
||||||
|
|
||||||
|
Mirrors the spec's three-axis filter: spatial (``bbox`` —
|
||||||
|
``(min_lat, min_lon, max_lat, max_lon)``), zoom (a tuple so the
|
||||||
|
operator can ask for multiple levels in one go), and sector
|
||||||
|
(``"active_conflict"`` or ``"stable_rear"`` — string form keeps this
|
||||||
|
DTO free of the C6 enum import).
|
||||||
|
"""
|
||||||
|
|
||||||
|
bbox: tuple[float, float, float, float]
|
||||||
|
zoom_levels: tuple[int, ...]
|
||||||
|
sector_class: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ProgressEvent:
|
||||||
|
"""One progress update emitted to ``C10BatcherConfig.progress_callback``."""
|
||||||
|
|
||||||
|
tiles_done: int
|
||||||
|
tiles_total: int
|
||||||
|
current_batch_size: int
|
||||||
|
elapsed_s: float
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class DescriptorBatchReport:
|
||||||
|
"""Terminal report returned by :meth:`populate_descriptors`."""
|
||||||
|
|
||||||
|
descriptors_generated: int
|
||||||
|
tiles_consumed: int
|
||||||
|
oom_retries: int
|
||||||
|
elapsed_s: float
|
||||||
|
outcome: BatcherOutcome
|
||||||
|
failure_reason: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class C10BatcherConfig:
|
||||||
|
"""Per-instance batcher policy.
|
||||||
|
|
||||||
|
``initial_batch_size`` defaults to 64 (the spec's worked example);
|
||||||
|
operators on smaller GPUs dial it down via YAML.
|
||||||
|
|
||||||
|
``max_oom_retries`` bounds halve-and-retry; default 1 keeps the
|
||||||
|
failure surface at "64 → 32 → fail" so a real GPU regression is
|
||||||
|
visible after ~5-10 s rather than a multi-minute timeout.
|
||||||
|
|
||||||
|
``progress_callback`` is optional; the batcher always emits DEBUG
|
||||||
|
logs at the same cadence so operators staring at a CLI see
|
||||||
|
progress without enabling tracing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
initial_batch_size: int = 64
|
||||||
|
max_oom_retries: int = 1
|
||||||
|
progress_callback: Callable[[ProgressEvent], None] | None = None
|
||||||
|
hnsw_m: int = 32
|
||||||
|
hnsw_ef_construction: int = 200
|
||||||
|
hnsw_ef_search: int = 64
|
||||||
|
hnsw_metric: str = "L2"
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
|
||||||
|
if self.initial_batch_size <= 0:
|
||||||
|
raise ValueError(
|
||||||
|
"C10BatcherConfig.initial_batch_size must be > 0; "
|
||||||
|
f"got {self.initial_batch_size}"
|
||||||
|
)
|
||||||
|
if self.max_oom_retries < 0:
|
||||||
|
raise ValueError(
|
||||||
|
"C10BatcherConfig.max_oom_retries must be >= 0; "
|
||||||
|
f"got {self.max_oom_retries}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class TilesByBboxBatchQuery(Protocol):
|
||||||
|
"""Consumer-side cut over C6's ``TileMetadataStore.query_by_bbox``.
|
||||||
|
|
||||||
|
AZ-322 needs the spatial+sector filter applied across multiple
|
||||||
|
zoom levels in one logical call; the composition-root adapter
|
||||||
|
(``runtime_root.c10_factory.build_descriptor_batcher``) loops over
|
||||||
|
``zoom_levels`` and yields one :class:`TileBboxRecord` per
|
||||||
|
matching row. Returns the concatenated list in deterministic order
|
||||||
|
so the int64 id mapping is stable across runs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def query_by_bbox_batch(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
bbox: tuple[float, float, float, float],
|
||||||
|
zoom_levels: tuple[int, ...],
|
||||||
|
sector_class: str,
|
||||||
|
) -> list[TileBboxRecord]: ...
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class BatcherTile:
|
||||||
|
"""Bundles a tile's identity with its mmap handle for one embed call."""
|
||||||
|
|
||||||
|
record: TileBboxRecord
|
||||||
|
pixel_handle: Any # ``TilePixelHandle`` from the C6 ABC; opaque here.
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class TilePixelOpener(Protocol):
|
||||||
|
"""Consumer-side cut over C6's ``TileStore.read_tile_pixels``.
|
||||||
|
|
||||||
|
Returns the C6 ``TilePixelHandle`` ABC (a context manager). The
|
||||||
|
composition-root adapter wraps the real C6 ``TileStore`` and
|
||||||
|
re-creates a ``TileId`` from the ``(zoom, lat, lon)`` triple before
|
||||||
|
delegating.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any: ...
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class DescriptorIndexRebuilder(Protocol):
|
||||||
|
"""Consumer-side cut over AZ-303 / AZ-306 ``rebuild_from_descriptors``.
|
||||||
|
|
||||||
|
The composition root passes the real
|
||||||
|
:class:`gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index.FaissDescriptorIndex`
|
||||||
|
instance — it structurally satisfies this Protocol. Tests pass a
|
||||||
|
spy.
|
||||||
|
|
||||||
|
AZ-322 hands ``tile_records`` (one per row of ``descriptors``)
|
||||||
|
rather than pre-computed int64 ids: AZ-306 owns the canonical
|
||||||
|
``tile_id_to_int64`` mapping (same rule as AC-6); we don't
|
||||||
|
duplicate the formula here. The composition-root adapter
|
||||||
|
constructs the C6 ``TileId`` instances + supplies them to
|
||||||
|
``rebuild_from_descriptors`` (which itself runs the int64
|
||||||
|
derivation per AZ-306).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def rebuild(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
descriptors: np.ndarray,
|
||||||
|
tile_records: list[TileBboxRecord],
|
||||||
|
hnsw_m: int,
|
||||||
|
hnsw_ef_construction: int,
|
||||||
|
hnsw_ef_search: int,
|
||||||
|
hnsw_metric: str,
|
||||||
|
) -> None: ...
|
||||||
|
|
||||||
|
|
||||||
|
class DescriptorBatcher:
|
||||||
|
"""Pre-flight descriptor-batched generation + FAISS rebuild.
|
||||||
|
|
||||||
|
Single public method: :meth:`populate_descriptors`. Construction
|
||||||
|
is dependency-injection only; the composition root supplies all
|
||||||
|
four collaborators and the config block.
|
||||||
|
|
||||||
|
Not re-entrant — concurrent calls on the same instance break the
|
||||||
|
progress accounting.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
backbone_embedder: BackboneEmbedder,
|
||||||
|
tiles_query: TilesByBboxBatchQuery,
|
||||||
|
tile_pixel_opener: TilePixelOpener,
|
||||||
|
descriptor_index: DescriptorIndexRebuilder,
|
||||||
|
clock: Clock,
|
||||||
|
logger: logging.Logger,
|
||||||
|
config: C10BatcherConfig,
|
||||||
|
) -> None:
|
||||||
|
self._embedder = backbone_embedder
|
||||||
|
self._tiles_query = tiles_query
|
||||||
|
self._tile_opener = tile_pixel_opener
|
||||||
|
self._descriptor_index = descriptor_index
|
||||||
|
self._clock = clock
|
||||||
|
self._logger = logger
|
||||||
|
self._config = config
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Public surface
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def populate_descriptors(self, corpus_filter: CorpusFilter) -> DescriptorBatchReport:
|
||||||
|
run_started_ns = self._clock.monotonic_ns()
|
||||||
|
|
||||||
|
records = self._tiles_query.query_by_bbox_batch(
|
||||||
|
bbox=corpus_filter.bbox,
|
||||||
|
zoom_levels=corpus_filter.zoom_levels,
|
||||||
|
sector_class=corpus_filter.sector_class,
|
||||||
|
)
|
||||||
|
if not records:
|
||||||
|
elapsed_s = self._elapsed_s(run_started_ns)
|
||||||
|
reason = (
|
||||||
|
"no tiles in C6 for the requested scope; run C11 "
|
||||||
|
"TileDownloader first"
|
||||||
|
)
|
||||||
|
self._logger.error(
|
||||||
|
f"{_LOG_KIND_PREFIX}.empty.corpus",
|
||||||
|
extra={
|
||||||
|
"kind": f"{_LOG_KIND_PREFIX}.empty.corpus",
|
||||||
|
"bbox": corpus_filter.bbox,
|
||||||
|
"zoom_levels": corpus_filter.zoom_levels,
|
||||||
|
"sector_class": corpus_filter.sector_class,
|
||||||
|
"elapsed_s": elapsed_s,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return DescriptorBatchReport(
|
||||||
|
descriptors_generated=0,
|
||||||
|
tiles_consumed=0,
|
||||||
|
oom_retries=0,
|
||||||
|
elapsed_s=elapsed_s,
|
||||||
|
outcome=BatcherOutcome.FAILURE,
|
||||||
|
failure_reason=reason,
|
||||||
|
)
|
||||||
|
|
||||||
|
descriptor_dim = int(self._embedder.descriptor_dim())
|
||||||
|
if descriptor_dim <= 0:
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
"DescriptorBatcher.populate_descriptors: backbone_embedder."
|
||||||
|
f"descriptor_dim() must be > 0; got {descriptor_dim}"
|
||||||
|
)
|
||||||
|
|
||||||
|
total = len(records)
|
||||||
|
self._logger.info(
|
||||||
|
f"{_LOG_KIND_PREFIX}.session.start",
|
||||||
|
extra={
|
||||||
|
"kind": f"{_LOG_KIND_PREFIX}.session.start",
|
||||||
|
"tiles_total": total,
|
||||||
|
"initial_batch_size": self._config.initial_batch_size,
|
||||||
|
"descriptor_dim": descriptor_dim,
|
||||||
|
"bbox": corpus_filter.bbox,
|
||||||
|
"zoom_levels": corpus_filter.zoom_levels,
|
||||||
|
"sector_class": corpus_filter.sector_class,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
descriptor_buffer: list[np.ndarray] = []
|
||||||
|
consumed_records: list[TileBboxRecord] = []
|
||||||
|
current_batch_size = self._config.initial_batch_size
|
||||||
|
oom_retries = 0
|
||||||
|
last_progress_emit = 0
|
||||||
|
|
||||||
|
idx = 0
|
||||||
|
while idx < total:
|
||||||
|
window = records[idx : idx + current_batch_size]
|
||||||
|
try:
|
||||||
|
batch_descriptors = self._embed_one_window(window, descriptor_dim)
|
||||||
|
except DescriptorBatchError as exc:
|
||||||
|
if (
|
||||||
|
self._is_oom(exc)
|
||||||
|
and oom_retries < self._config.max_oom_retries
|
||||||
|
and current_batch_size > 1
|
||||||
|
):
|
||||||
|
new_size = max(1, current_batch_size // 2)
|
||||||
|
self._logger.warning(
|
||||||
|
f"{_LOG_KIND_PREFIX}.oom.retry",
|
||||||
|
extra={
|
||||||
|
"kind": f"{_LOG_KIND_PREFIX}.oom.retry",
|
||||||
|
"previous_batch_size": current_batch_size,
|
||||||
|
"new_batch_size": new_size,
|
||||||
|
"tiles_done": idx,
|
||||||
|
"tiles_total": total,
|
||||||
|
"oom_retries_after": oom_retries + 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
current_batch_size = new_size
|
||||||
|
oom_retries += 1
|
||||||
|
continue
|
||||||
|
self._logger.error(
|
||||||
|
f"{_LOG_KIND_PREFIX}.oom.terminal",
|
||||||
|
extra={
|
||||||
|
"kind": f"{_LOG_KIND_PREFIX}.oom.terminal",
|
||||||
|
"batch_size": current_batch_size,
|
||||||
|
"tiles_done": idx,
|
||||||
|
"tiles_total": total,
|
||||||
|
"oom_retries": oom_retries,
|
||||||
|
"first_tile_in_batch": (
|
||||||
|
window[0].zoom,
|
||||||
|
window[0].lat,
|
||||||
|
window[0].lon,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
descriptor_buffer.append(batch_descriptors)
|
||||||
|
consumed_records.extend(window)
|
||||||
|
idx += len(window)
|
||||||
|
|
||||||
|
elapsed_s = self._elapsed_s(run_started_ns)
|
||||||
|
self._maybe_emit_progress(
|
||||||
|
tiles_done=idx,
|
||||||
|
tiles_total=total,
|
||||||
|
current_batch_size=current_batch_size,
|
||||||
|
elapsed_s=elapsed_s,
|
||||||
|
last_emit_ref=last_progress_emit,
|
||||||
|
)
|
||||||
|
last_progress_emit = (idx * _PROGRESS_CALLBACK_FRACTION) // total
|
||||||
|
|
||||||
|
descriptors = np.concatenate(descriptor_buffer, axis=0)
|
||||||
|
if descriptors.shape != (total, descriptor_dim):
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
"DescriptorBatcher.populate_descriptors: descriptor matrix has "
|
||||||
|
f"shape {descriptors.shape}, expected ({total}, {descriptor_dim})"
|
||||||
|
)
|
||||||
|
|
||||||
|
self._descriptor_index.rebuild(
|
||||||
|
descriptors=np.ascontiguousarray(descriptors, dtype=np.float32),
|
||||||
|
tile_records=consumed_records,
|
||||||
|
hnsw_m=self._config.hnsw_m,
|
||||||
|
hnsw_ef_construction=self._config.hnsw_ef_construction,
|
||||||
|
hnsw_ef_search=self._config.hnsw_ef_search,
|
||||||
|
hnsw_metric=self._config.hnsw_metric,
|
||||||
|
)
|
||||||
|
|
||||||
|
elapsed_s = self._elapsed_s(run_started_ns)
|
||||||
|
self._logger.info(
|
||||||
|
f"{_LOG_KIND_PREFIX}.session.complete",
|
||||||
|
extra={
|
||||||
|
"kind": f"{_LOG_KIND_PREFIX}.session.complete",
|
||||||
|
"tiles_total": total,
|
||||||
|
"descriptors_generated": total,
|
||||||
|
"oom_retries": oom_retries,
|
||||||
|
"elapsed_s": elapsed_s,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return DescriptorBatchReport(
|
||||||
|
descriptors_generated=total,
|
||||||
|
tiles_consumed=total,
|
||||||
|
oom_retries=oom_retries,
|
||||||
|
elapsed_s=elapsed_s,
|
||||||
|
outcome=BatcherOutcome.SUCCESS,
|
||||||
|
failure_reason=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# Internals
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _embed_one_window(
|
||||||
|
self, window: list[TileBboxRecord], descriptor_dim: int
|
||||||
|
) -> np.ndarray:
|
||||||
|
with ExitStack() as stack:
|
||||||
|
handles: list[Any] = []
|
||||||
|
for record in window:
|
||||||
|
handle = self._tile_opener.open_tile(
|
||||||
|
zoom=record.zoom, lat=record.lat, lon=record.lon
|
||||||
|
)
|
||||||
|
# The C6 ``TilePixelHandle`` ABC is a context manager
|
||||||
|
# whose __exit__ unmaps the file. ExitStack guarantees
|
||||||
|
# release even if embed_batch raises mid-flight.
|
||||||
|
stack.enter_context(handle)
|
||||||
|
handles.append(handle)
|
||||||
|
batch_descriptors = self._embedder.embed_batch(handles)
|
||||||
|
|
||||||
|
return self._validate_batch_shape(
|
||||||
|
batch_descriptors, expected_rows=len(window), descriptor_dim=descriptor_dim
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _validate_batch_shape(
|
||||||
|
batch_descriptors: object,
|
||||||
|
*,
|
||||||
|
expected_rows: int,
|
||||||
|
descriptor_dim: int,
|
||||||
|
) -> np.ndarray:
|
||||||
|
if not isinstance(batch_descriptors, np.ndarray):
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
"DescriptorBatcher: embed_batch must return numpy.ndarray; "
|
||||||
|
f"got {type(batch_descriptors).__name__}"
|
||||||
|
)
|
||||||
|
if batch_descriptors.ndim != 2:
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
"DescriptorBatcher: embed_batch must return a 2-D array; "
|
||||||
|
f"got ndim={batch_descriptors.ndim}"
|
||||||
|
)
|
||||||
|
if batch_descriptors.shape[0] != expected_rows:
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
"DescriptorBatcher: embed_batch returned "
|
||||||
|
f"{batch_descriptors.shape[0]} rows; expected {expected_rows}"
|
||||||
|
)
|
||||||
|
if batch_descriptors.shape[1] != descriptor_dim:
|
||||||
|
raise DescriptorBatchError(
|
||||||
|
"DescriptorBatcher: descriptor_dim mismatch — embed_batch "
|
||||||
|
f"returned shape[1]={batch_descriptors.shape[1]}, but "
|
||||||
|
f"descriptor_dim()={descriptor_dim}"
|
||||||
|
)
|
||||||
|
if batch_descriptors.dtype != np.float32:
|
||||||
|
return batch_descriptors.astype(np.float32, copy=False)
|
||||||
|
return batch_descriptors
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_oom(exc: DescriptorBatchError) -> bool:
|
||||||
|
# Spec AC-2 distinguishes OOM by message substring; no separate
|
||||||
|
# exception subclass — keeps the impl free of CUDA-specific imports.
|
||||||
|
return "CUDA OOM" in str(exc)
|
||||||
|
|
||||||
|
def _elapsed_s(self, run_started_ns: int) -> float:
|
||||||
|
return max(0.0, (self._clock.monotonic_ns() - run_started_ns) / 1e9)
|
||||||
|
|
||||||
|
def _maybe_emit_progress(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
tiles_done: int,
|
||||||
|
tiles_total: int,
|
||||||
|
current_batch_size: int,
|
||||||
|
elapsed_s: float,
|
||||||
|
last_emit_ref: int,
|
||||||
|
) -> None:
|
||||||
|
# Fire at every 10% boundary crossed since last emit (so the
|
||||||
|
# callback receives exactly 10 events for 1000 tiles, even when
|
||||||
|
# batch sizes don't divide evenly into 100 tiles per step).
|
||||||
|
current_decile = (tiles_done * _PROGRESS_CALLBACK_FRACTION) // tiles_total
|
||||||
|
for decile in range(last_emit_ref + 1, current_decile + 1):
|
||||||
|
tiles_at_decile = (decile * tiles_total) // _PROGRESS_CALLBACK_FRACTION
|
||||||
|
event = ProgressEvent(
|
||||||
|
tiles_done=tiles_at_decile,
|
||||||
|
tiles_total=tiles_total,
|
||||||
|
current_batch_size=current_batch_size,
|
||||||
|
elapsed_s=elapsed_s,
|
||||||
|
)
|
||||||
|
self._logger.debug(
|
||||||
|
f"{_LOG_KIND_PREFIX}.progress",
|
||||||
|
extra={
|
||||||
|
"kind": f"{_LOG_KIND_PREFIX}.progress",
|
||||||
|
"tiles_done": event.tiles_done,
|
||||||
|
"tiles_total": event.tiles_total,
|
||||||
|
"current_batch_size": event.current_batch_size,
|
||||||
|
"elapsed_s": event.elapsed_s,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if self._config.progress_callback is not None:
|
||||||
|
self._config.progress_callback(event)
|
||||||
@@ -12,6 +12,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"C10ProvisioningError",
|
"C10ProvisioningError",
|
||||||
|
"DescriptorBatchError",
|
||||||
"ManifestWriteError",
|
"ManifestWriteError",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -20,6 +21,26 @@ class C10ProvisioningError(Exception):
|
|||||||
"""Base class for the C10 cache-provisioning error family."""
|
"""Base class for the C10 cache-provisioning error family."""
|
||||||
|
|
||||||
|
|
||||||
|
class DescriptorBatchError(C10ProvisioningError):
|
||||||
|
"""``DescriptorBatcher.populate_descriptors`` could not finish (AZ-322).
|
||||||
|
|
||||||
|
Surfaces three failure modes:
|
||||||
|
|
||||||
|
1. ``"CUDA OOM"`` raised by the injected
|
||||||
|
:class:`gps_denied_onboard.components.c10_provisioning.interface.BackboneEmbedder`;
|
||||||
|
the batcher catches the OOM-flavoured instance and triggers the
|
||||||
|
halve-and-retry loop (AC-2). Persistent OOM after retries are
|
||||||
|
exhausted re-raises with the final batch size + tile-id
|
||||||
|
context (AC-3).
|
||||||
|
2. ``descriptor_dim`` mismatch — the impl returned a column count
|
||||||
|
that does not equal :meth:`BackboneEmbedder.descriptor_dim`
|
||||||
|
(AC-9); raised BEFORE the FAISS rebuild call so an existing
|
||||||
|
valid index is not corrupted.
|
||||||
|
3. Underlying FAISS rebuild failure (rewrapped from the AZ-306
|
||||||
|
:class:`IndexBuildError` envelope).
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class ManifestWriteError(C10ProvisioningError):
|
class ManifestWriteError(C10ProvisioningError):
|
||||||
"""``ManifestBuilder.build_manifest`` could not produce a signed Manifest.
|
"""``ManifestBuilder.build_manifest`` could not produce a signed Manifest.
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,11 @@
|
|||||||
- :class:`CacheProvisioner` (AZ-325, pending) — pre-flight orchestrator.
|
- :class:`CacheProvisioner` (AZ-325, pending) — pre-flight orchestrator.
|
||||||
- :class:`ManifestSigner` (AZ-323) — Ed25519 detached signing surface
|
- :class:`ManifestSigner` (AZ-323) — Ed25519 detached signing surface
|
||||||
consumed by :class:`ManifestBuilder`.
|
consumed by :class:`ManifestBuilder`.
|
||||||
|
- :class:`BackboneEmbedder` (AZ-322) — image-batch → descriptor surface
|
||||||
|
consumed by :class:`DescriptorBatcher`. The default impl wraps the
|
||||||
|
AZ-298 / AZ-299 / AZ-300 ``InferenceRuntime``-produced engine; when
|
||||||
|
E-C2 (AZ-336+) ships its public embed surface a thin adapter swaps
|
||||||
|
the impl in via the composition root.
|
||||||
|
|
||||||
Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
|
Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
|
||||||
`_docs/02_document/components/11_c10_provisioning/`.
|
`_docs/02_document/components/11_c10_provisioning/`.
|
||||||
@@ -11,11 +16,15 @@ Concrete impl: engine compile + descriptors + manifest + content-hash gate. See
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Protocol, runtime_checkable
|
from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
|
||||||
|
|
||||||
from gps_denied_onboard._types.manifests import Manifest
|
from gps_denied_onboard._types.manifests import Manifest
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"BackboneEmbedder",
|
||||||
"CacheProvisioner",
|
"CacheProvisioner",
|
||||||
"ManifestSigner",
|
"ManifestSigner",
|
||||||
"SigningKeyHandle",
|
"SigningKeyHandle",
|
||||||
@@ -66,3 +75,28 @@ class ManifestSigner(Protocol):
|
|||||||
def sign(self, key: SigningKeyHandle, payload_bytes: bytes) -> bytes: ...
|
def sign(self, key: SigningKeyHandle, payload_bytes: bytes) -> bytes: ...
|
||||||
|
|
||||||
def public_key_fingerprint(self, key: SigningKeyHandle) -> str: ...
|
def public_key_fingerprint(self, key: SigningKeyHandle) -> str: ...
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
|
||||||
|
class BackboneEmbedder(Protocol):
|
||||||
|
"""Image-batch → descriptor matrix surface (AZ-322).
|
||||||
|
|
||||||
|
Two-method contract:
|
||||||
|
|
||||||
|
- :meth:`embed_batch` takes a list of mmap-backed tile pixel
|
||||||
|
handles (any object exposing the c6 ``TilePixelHandle`` ABC) and
|
||||||
|
returns an ``np.ndarray`` of shape ``(len(tiles),
|
||||||
|
descriptor_dim())`` with ``dtype == float32``.
|
||||||
|
- :meth:`descriptor_dim` returns the fixed descriptor dimension
|
||||||
|
the impl produces; queried once before the first batch and used
|
||||||
|
to validate every batch's last axis (AC-9).
|
||||||
|
|
||||||
|
On CUDA OOM the impl raises
|
||||||
|
:class:`gps_denied_onboard.components.c10_provisioning.errors.DescriptorBatchError`
|
||||||
|
with ``"CUDA OOM"`` in the message — the batcher catches this
|
||||||
|
distinguishable subtype and triggers halve-and-retry (AC-2).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def embed_batch(self, tiles: list[Any]) -> np.ndarray: ...
|
||||||
|
|
||||||
|
def descriptor_dim(self) -> int: ...
|
||||||
|
|||||||
@@ -15,21 +15,30 @@ than a code change.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from gps_denied_onboard.components.c10_provisioning import (
|
from gps_denied_onboard.components.c10_provisioning import (
|
||||||
BackboneSpec,
|
BackboneSpec,
|
||||||
|
C10BatcherConfig,
|
||||||
|
DescriptorBatcher,
|
||||||
|
DescriptorIndexRebuilder,
|
||||||
Ed25519ManifestSigner,
|
Ed25519ManifestSigner,
|
||||||
EngineCompiler,
|
EngineCompiler,
|
||||||
ManifestBuilder,
|
ManifestBuilder,
|
||||||
ManifestVerifierImpl,
|
ManifestVerifierImpl,
|
||||||
|
TileBboxRecord,
|
||||||
TileHashRecord,
|
TileHashRecord,
|
||||||
|
TilePixelOpener,
|
||||||
|
TilesByBboxBatchQuery,
|
||||||
TilesByBboxQuery,
|
TilesByBboxQuery,
|
||||||
)
|
)
|
||||||
from gps_denied_onboard.components.c10_provisioning.config import (
|
from gps_denied_onboard.components.c10_provisioning.config import (
|
||||||
BackboneConfig,
|
BackboneConfig,
|
||||||
C10ProvisioningConfig,
|
C10ProvisioningConfig,
|
||||||
)
|
)
|
||||||
|
from gps_denied_onboard.components.c10_provisioning.interface import (
|
||||||
|
BackboneEmbedder,
|
||||||
|
)
|
||||||
from gps_denied_onboard.helpers.sha256_sidecar import Sha256Sidecar
|
from gps_denied_onboard.helpers.sha256_sidecar import Sha256Sidecar
|
||||||
from gps_denied_onboard.logging import get_logger
|
from gps_denied_onboard.logging import get_logger
|
||||||
from gps_denied_onboard.runtime_root.inference_factory import (
|
from gps_denied_onboard.runtime_root.inference_factory import (
|
||||||
@@ -38,15 +47,23 @@ from gps_denied_onboard.runtime_root.inference_factory import (
|
|||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from gps_denied_onboard.clock import Clock
|
from gps_denied_onboard.clock import Clock
|
||||||
from gps_denied_onboard.components.c6_tile_cache import TileMetadataStore
|
from gps_denied_onboard.components.c6_tile_cache import (
|
||||||
|
DescriptorIndex,
|
||||||
|
TileMetadataStore,
|
||||||
|
TileStore,
|
||||||
|
)
|
||||||
from gps_denied_onboard.config.schema import Config
|
from gps_denied_onboard.config.schema import Config
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"build_backbone_specs",
|
"build_backbone_specs",
|
||||||
|
"build_descriptor_batcher",
|
||||||
"build_engine_compiler",
|
"build_engine_compiler",
|
||||||
"build_manifest_builder",
|
"build_manifest_builder",
|
||||||
"build_manifest_verifier",
|
"build_manifest_verifier",
|
||||||
|
"c6_descriptor_index_to_rebuilder",
|
||||||
|
"c6_tile_metadata_store_to_tiles_batch_query",
|
||||||
"c6_tile_metadata_store_to_tiles_query",
|
"c6_tile_metadata_store_to_tiles_query",
|
||||||
|
"c6_tile_store_to_pixel_opener",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -219,3 +236,200 @@ def c6_tile_metadata_store_to_tiles_query(
|
|||||||
)
|
)
|
||||||
|
|
||||||
return _C6TilesAdapter(tile_metadata_store)
|
return _C6TilesAdapter(tile_metadata_store)
|
||||||
|
|
||||||
|
|
||||||
|
def build_descriptor_batcher(
|
||||||
|
config: Config,
|
||||||
|
*,
|
||||||
|
backbone_embedder: BackboneEmbedder,
|
||||||
|
tile_metadata_store: TileMetadataStore,
|
||||||
|
tile_store: TileStore,
|
||||||
|
descriptor_index: DescriptorIndex,
|
||||||
|
clock: Clock,
|
||||||
|
) -> DescriptorBatcher:
|
||||||
|
"""Construct a wired :class:`DescriptorBatcher` (AZ-322).
|
||||||
|
|
||||||
|
The factory:
|
||||||
|
|
||||||
|
1. Adapts C6's ``TileMetadataStore`` to C10's
|
||||||
|
:class:`TilesByBboxBatchQuery` cut.
|
||||||
|
2. Adapts C6's ``TileStore`` to C10's :class:`TilePixelOpener` cut.
|
||||||
|
3. Adapts C6's ``DescriptorIndex`` to C10's
|
||||||
|
:class:`DescriptorIndexRebuilder` cut.
|
||||||
|
4. Reads the C10 batcher knobs from
|
||||||
|
``config.components['c10_provisioning']`` (currently defaults
|
||||||
|
only — a dedicated config block lands when AZ-326 wires the T5
|
||||||
|
orchestrator).
|
||||||
|
|
||||||
|
The ``backbone_embedder`` is supplied by the operator binary
|
||||||
|
(composition root); the most common impl is the
|
||||||
|
:class:`C7EngineBackboneEmbedder`. Keeping it injected here
|
||||||
|
instead of constructed inside the factory lets E-C2 (AZ-255) swap
|
||||||
|
in its public embed API later via a one-line factory swap, per
|
||||||
|
the AZ-322 spec § Risk-1 mitigation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
logger = get_logger("c10_provisioning.descriptor_batcher")
|
||||||
|
return DescriptorBatcher(
|
||||||
|
backbone_embedder=backbone_embedder,
|
||||||
|
tiles_query=c6_tile_metadata_store_to_tiles_batch_query(
|
||||||
|
tile_metadata_store
|
||||||
|
),
|
||||||
|
tile_pixel_opener=c6_tile_store_to_pixel_opener(tile_store),
|
||||||
|
descriptor_index=c6_descriptor_index_to_rebuilder(descriptor_index),
|
||||||
|
clock=clock,
|
||||||
|
logger=logger,
|
||||||
|
config=C10BatcherConfig(),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def c6_tile_metadata_store_to_tiles_batch_query(
|
||||||
|
tile_metadata_store: TileMetadataStore,
|
||||||
|
) -> TilesByBboxBatchQuery:
|
||||||
|
"""Adapt C6 ``TileMetadataStore`` to C10's ``TilesByBboxBatchQuery``.
|
||||||
|
|
||||||
|
C6's ``query_by_bbox`` accepts a single ``zoom`` and a ``Bbox`` DTO;
|
||||||
|
the batcher cut takes ``zoom_levels: tuple[int, ...]`` and a 4-tuple
|
||||||
|
bbox. This adapter loops over the zoom set and concatenates the
|
||||||
|
results, projecting :class:`TileMetadata` rows down to the
|
||||||
|
:class:`TileBboxRecord` shape the batcher needs (zoom + lat + lon
|
||||||
|
+ source — the rest of the metadata row is irrelevant to the
|
||||||
|
descriptor pipeline).
|
||||||
|
|
||||||
|
Lives in ``runtime_root`` because it is the only layer allowed to
|
||||||
|
import both C6 and C10 (AZ-270 lint).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from gps_denied_onboard.components.c6_tile_cache import (
|
||||||
|
Bbox as C6Bbox,
|
||||||
|
)
|
||||||
|
from gps_denied_onboard.components.c6_tile_cache import (
|
||||||
|
SectorClassification as C6SectorClassification,
|
||||||
|
)
|
||||||
|
|
||||||
|
class _C6BatchTilesAdapter:
|
||||||
|
def __init__(self, store: TileMetadataStore) -> None:
|
||||||
|
self._store = store
|
||||||
|
|
||||||
|
def query_by_bbox_batch(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
bbox: tuple[float, float, float, float],
|
||||||
|
zoom_levels: tuple[int, ...],
|
||||||
|
sector_class: str,
|
||||||
|
) -> list[TileBboxRecord]:
|
||||||
|
# ``sector_class`` is currently a soft filter (the
|
||||||
|
# batcher's CorpusFilter carries it to keep parity with
|
||||||
|
# the manifest builder); C6's query_by_bbox does not
|
||||||
|
# accept it directly, so we pre-validate the enum here
|
||||||
|
# and let the upstream metadata classification gate
|
||||||
|
# invalidate freshness if needed.
|
||||||
|
C6SectorClassification(sector_class)
|
||||||
|
min_lat, min_lon, max_lat, max_lon = bbox
|
||||||
|
c6_bbox = C6Bbox(
|
||||||
|
min_lat=min_lat,
|
||||||
|
min_lon=min_lon,
|
||||||
|
max_lat=max_lat,
|
||||||
|
max_lon=max_lon,
|
||||||
|
)
|
||||||
|
records: list[TileBboxRecord] = []
|
||||||
|
for zoom in zoom_levels:
|
||||||
|
rows = self._store.query_by_bbox(bbox=c6_bbox, zoom=zoom)
|
||||||
|
for row in rows:
|
||||||
|
source = row.source
|
||||||
|
source_str = (
|
||||||
|
source.value if hasattr(source, "value") else str(source)
|
||||||
|
)
|
||||||
|
records.append(
|
||||||
|
TileBboxRecord(
|
||||||
|
zoom=row.tile_id.zoom_level,
|
||||||
|
lat=row.tile_id.lat,
|
||||||
|
lon=row.tile_id.lon,
|
||||||
|
source=source_str,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return records
|
||||||
|
|
||||||
|
return _C6BatchTilesAdapter(tile_metadata_store)
|
||||||
|
|
||||||
|
|
||||||
|
def c6_tile_store_to_pixel_opener(
|
||||||
|
tile_store: TileStore,
|
||||||
|
) -> TilePixelOpener:
|
||||||
|
"""Adapt C6 ``TileStore`` to C10's ``TilePixelOpener`` cut.
|
||||||
|
|
||||||
|
The C6 contract: ``read_tile_pixels(tile_id) -> TilePixelHandle``,
|
||||||
|
where :class:`TilePixelHandle` is itself a context manager (mmap
|
||||||
|
handle that closes on ``__exit__``). The batcher cut: ``open_tile(zoom, lat, lon)
|
||||||
|
-> ContextManager``. This adapter just builds a ``TileId`` and
|
||||||
|
returns the C6 handle directly — the call shape matches because
|
||||||
|
:class:`TilePixelHandle` already implements ``__enter__`` /
|
||||||
|
``__exit__``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from gps_denied_onboard.components.c6_tile_cache import TileId
|
||||||
|
|
||||||
|
class _C6PixelOpenerAdapter:
|
||||||
|
def __init__(self, store: TileStore) -> None:
|
||||||
|
self._store = store
|
||||||
|
|
||||||
|
def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any:
|
||||||
|
tile_id = TileId(zoom_level=zoom, lat=lat, lon=lon)
|
||||||
|
return self._store.read_tile_pixels(tile_id)
|
||||||
|
|
||||||
|
return _C6PixelOpenerAdapter(tile_store)
|
||||||
|
|
||||||
|
|
||||||
|
def c6_descriptor_index_to_rebuilder(
|
||||||
|
descriptor_index: DescriptorIndex,
|
||||||
|
) -> DescriptorIndexRebuilder:
|
||||||
|
"""Adapt C6 ``DescriptorIndex`` to C10's ``DescriptorIndexRebuilder``.
|
||||||
|
|
||||||
|
C6's ``rebuild_from_descriptors(descriptors, tile_ids: list[TileId],
|
||||||
|
hnsw_params: HnswParams)`` is the AZ-303 / AZ-306 contract; the
|
||||||
|
batcher cut ``rebuild(*, descriptors, tile_records, hnsw_*)`` is
|
||||||
|
transport-decoupled. This adapter projects ``TileBboxRecord`` →
|
||||||
|
``TileId`` and folds the four HNSW kwargs into the
|
||||||
|
:class:`HnswParams` DTO before delegating.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from gps_denied_onboard.components.c6_tile_cache import (
|
||||||
|
HnswParams,
|
||||||
|
TileId,
|
||||||
|
)
|
||||||
|
|
||||||
|
class _C6RebuilderAdapter:
|
||||||
|
def __init__(self, index: DescriptorIndex) -> None:
|
||||||
|
self._index = index
|
||||||
|
|
||||||
|
def rebuild(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
descriptors,
|
||||||
|
tile_records,
|
||||||
|
hnsw_m,
|
||||||
|
hnsw_ef_construction,
|
||||||
|
hnsw_ef_search,
|
||||||
|
hnsw_metric,
|
||||||
|
):
|
||||||
|
tile_ids = [
|
||||||
|
TileId(
|
||||||
|
zoom_level=record.zoom,
|
||||||
|
lat=record.lat,
|
||||||
|
lon=record.lon,
|
||||||
|
)
|
||||||
|
for record in tile_records
|
||||||
|
]
|
||||||
|
params = HnswParams(
|
||||||
|
m=hnsw_m,
|
||||||
|
ef_construction=hnsw_ef_construction,
|
||||||
|
ef_search=hnsw_ef_search,
|
||||||
|
metric=hnsw_metric,
|
||||||
|
)
|
||||||
|
self._index.rebuild_from_descriptors(
|
||||||
|
descriptors=descriptors,
|
||||||
|
tile_ids=tile_ids,
|
||||||
|
hnsw_params=params,
|
||||||
|
)
|
||||||
|
|
||||||
|
return _C6RebuilderAdapter(descriptor_index)
|
||||||
|
|||||||
@@ -0,0 +1,591 @@
|
|||||||
|
"""AZ-322 — C10 ``DescriptorBatcher`` unit tests.
|
||||||
|
|
||||||
|
Covers AC-1 through AC-10 plus NFR-perf-overhead + NFR-reliability-bounded-retry
|
||||||
|
from ``_docs/02_tasks/todo/AZ-322_c10_descriptor_batcher.md``.
|
||||||
|
|
||||||
|
The fixtures use spy objects for the four collaborator surfaces
|
||||||
|
(:class:`BackboneEmbedder`, :class:`TilesByBboxBatchQuery`,
|
||||||
|
:class:`TilePixelOpener`, :class:`DescriptorIndexRebuilder`) so the
|
||||||
|
tests stay free of CUDA / FAISS / Postgres. AZ-507 separately covers
|
||||||
|
the structural-Protocol conformance of the real C7 / C6 wires through
|
||||||
|
the composition root.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from collections.abc import Callable
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from gps_denied_onboard.components.c10_provisioning import (
|
||||||
|
BackboneEmbedder,
|
||||||
|
C10BatcherConfig,
|
||||||
|
CorpusFilter,
|
||||||
|
DescriptorBatcher,
|
||||||
|
DescriptorBatchError,
|
||||||
|
DescriptorIndexRebuilder,
|
||||||
|
ProgressEvent,
|
||||||
|
TileBboxRecord,
|
||||||
|
TilePixelOpener,
|
||||||
|
TilesByBboxBatchQuery,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- helpers
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_DIM = 8
|
||||||
|
_DEFAULT_CORPUS_FILTER = CorpusFilter(
|
||||||
|
bbox=(49.0, 36.0, 49.5, 36.5),
|
||||||
|
zoom_levels=(18,),
|
||||||
|
sector_class="active_conflict",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _records(n: int) -> list[TileBboxRecord]:
|
||||||
|
return [
|
||||||
|
TileBboxRecord(zoom=18, lat=49.0 + (i * 1e-4), lon=36.0 + (i * 1e-4), source="googlemaps")
|
||||||
|
for i in range(n)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _FakeClock:
|
||||||
|
"""Deterministic clock — counts up by 1ms per call."""
|
||||||
|
|
||||||
|
base_ns: int = 0
|
||||||
|
step_ns: int = 1_000_000
|
||||||
|
|
||||||
|
def monotonic_ns(self) -> int:
|
||||||
|
self.base_ns += self.step_ns
|
||||||
|
return self.base_ns
|
||||||
|
|
||||||
|
def time_ns(self) -> int:
|
||||||
|
return self.base_ns
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _FakeTilesQuery:
|
||||||
|
rows: list[TileBboxRecord]
|
||||||
|
captured_args: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
def query_by_bbox_batch(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
bbox: tuple[float, float, float, float],
|
||||||
|
zoom_levels: tuple[int, ...],
|
||||||
|
sector_class: str,
|
||||||
|
) -> list[TileBboxRecord]:
|
||||||
|
self.captured_args = {
|
||||||
|
"bbox": bbox,
|
||||||
|
"zoom_levels": zoom_levels,
|
||||||
|
"sector_class": sector_class,
|
||||||
|
}
|
||||||
|
return list(self.rows)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _FakeTileOpener:
|
||||||
|
"""Returns context-manager handles whose payload is a synthetic image."""
|
||||||
|
|
||||||
|
opens: list[tuple[int, float, float]] = field(default_factory=list)
|
||||||
|
closes: list[tuple[int, float, float]] = field(default_factory=list)
|
||||||
|
|
||||||
|
def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any:
|
||||||
|
opener = self
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _handle() -> Any:
|
||||||
|
opener.opens.append((zoom, lat, lon))
|
||||||
|
try:
|
||||||
|
yield (zoom, lat, lon)
|
||||||
|
finally:
|
||||||
|
opener.closes.append((zoom, lat, lon))
|
||||||
|
|
||||||
|
return _handle()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _FakeRebuilder:
|
||||||
|
"""Captures the rebuild call so AC-1, AC-7, AC-9, AC-12 can inspect it."""
|
||||||
|
|
||||||
|
calls: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
raise_exc: Exception | None = None
|
||||||
|
|
||||||
|
def rebuild(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
descriptors: np.ndarray,
|
||||||
|
tile_records: list[TileBboxRecord],
|
||||||
|
hnsw_m: int,
|
||||||
|
hnsw_ef_construction: int,
|
||||||
|
hnsw_ef_search: int,
|
||||||
|
hnsw_metric: str,
|
||||||
|
) -> None:
|
||||||
|
if self.raise_exc is not None:
|
||||||
|
raise self.raise_exc
|
||||||
|
self.calls.append(
|
||||||
|
{
|
||||||
|
"descriptors": descriptors.copy(),
|
||||||
|
"tile_records": list(tile_records),
|
||||||
|
"hnsw_m": hnsw_m,
|
||||||
|
"hnsw_ef_construction": hnsw_ef_construction,
|
||||||
|
"hnsw_ef_search": hnsw_ef_search,
|
||||||
|
"hnsw_metric": hnsw_metric,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _ScriptedEmbedder:
|
||||||
|
"""Embedder driven by a per-call scripted behavior."""
|
||||||
|
|
||||||
|
descriptor_dim_value: int = _DEFAULT_DIM
|
||||||
|
on_call: Callable[[int, list[Any]], np.ndarray] | None = None
|
||||||
|
call_count: int = 0
|
||||||
|
call_sizes: list[int] = field(default_factory=list)
|
||||||
|
|
||||||
|
def descriptor_dim(self) -> int:
|
||||||
|
return self.descriptor_dim_value
|
||||||
|
|
||||||
|
def embed_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||||
|
self.call_count += 1
|
||||||
|
self.call_sizes.append(len(tiles))
|
||||||
|
if self.on_call is not None:
|
||||||
|
return self.on_call(self.call_count, tiles)
|
||||||
|
return np.zeros((len(tiles), self.descriptor_dim_value), dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_batcher(
|
||||||
|
*,
|
||||||
|
embedder: _ScriptedEmbedder | None = None,
|
||||||
|
tiles: _FakeTilesQuery | None = None,
|
||||||
|
opener: _FakeTileOpener | None = None,
|
||||||
|
rebuilder: _FakeRebuilder | None = None,
|
||||||
|
config: C10BatcherConfig | None = None,
|
||||||
|
) -> tuple[DescriptorBatcher, _ScriptedEmbedder, _FakeTilesQuery, _FakeTileOpener, _FakeRebuilder, logging.Logger]:
|
||||||
|
embedder = embedder or _ScriptedEmbedder()
|
||||||
|
tiles = tiles or _FakeTilesQuery(rows=[])
|
||||||
|
opener = opener or _FakeTileOpener()
|
||||||
|
rebuilder = rebuilder or _FakeRebuilder()
|
||||||
|
cfg = config or C10BatcherConfig()
|
||||||
|
logger = logging.getLogger("tests.az322")
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
batcher = DescriptorBatcher(
|
||||||
|
backbone_embedder=embedder,
|
||||||
|
tiles_query=tiles,
|
||||||
|
tile_pixel_opener=opener,
|
||||||
|
descriptor_index=rebuilder,
|
||||||
|
clock=_FakeClock(),
|
||||||
|
logger=logger,
|
||||||
|
config=cfg,
|
||||||
|
)
|
||||||
|
return batcher, embedder, tiles, opener, rebuilder, logger
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-1
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac1_happy_path_embeds_all_tiles_and_rebuilds() -> None:
|
||||||
|
rows = _records(1000)
|
||||||
|
|
||||||
|
def emit(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
return np.full((len(tiles), _DEFAULT_DIM), float(call_idx), dtype=np.float32)
|
||||||
|
|
||||||
|
batcher, embedder, _, _, rebuilder, _ = _make_batcher(
|
||||||
|
embedder=_ScriptedEmbedder(on_call=emit),
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
)
|
||||||
|
|
||||||
|
report = batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
assert embedder.call_count == 16 # ceil(1000 / 64)
|
||||||
|
assert sum(embedder.call_sizes) == 1000
|
||||||
|
assert len(rebuilder.calls) == 1
|
||||||
|
rebuild_call = rebuilder.calls[0]
|
||||||
|
assert rebuild_call["descriptors"].shape == (1000, _DEFAULT_DIM)
|
||||||
|
assert rebuild_call["descriptors"].dtype == np.float32
|
||||||
|
assert len(rebuild_call["tile_records"]) == 1000
|
||||||
|
assert report.descriptors_generated == 1000
|
||||||
|
assert report.tiles_consumed == 1000
|
||||||
|
assert report.oom_retries == 0
|
||||||
|
assert report.outcome.value == "success"
|
||||||
|
assert report.failure_reason is None
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-2
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac2_cuda_oom_halves_batch_size_and_retries(caplog: pytest.LogCaptureFixture) -> None:
|
||||||
|
rows = _records(64)
|
||||||
|
|
||||||
|
def emit(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
if call_idx == 1 and len(tiles) == 64:
|
||||||
|
raise DescriptorBatchError("CUDA OOM at batch_size=64")
|
||||||
|
return np.zeros((len(tiles), _DEFAULT_DIM), dtype=np.float32)
|
||||||
|
|
||||||
|
batcher, embedder, _, _, rebuilder, _ = _make_batcher(
|
||||||
|
embedder=_ScriptedEmbedder(on_call=emit),
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.WARNING):
|
||||||
|
report = batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
# 1st call: 64 → OOM. 2nd call: 32 → success. 3rd call: remaining 32 → success.
|
||||||
|
assert embedder.call_sizes == [64, 32, 32]
|
||||||
|
assert report.oom_retries == 1
|
||||||
|
assert report.outcome.value == "success"
|
||||||
|
assert len(rebuilder.calls) == 1
|
||||||
|
oom_records = [r for r in caplog.records if r.message.endswith("oom.retry")]
|
||||||
|
assert len(oom_records) == 1
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-3
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac3_persistent_oom_after_halve_retry_exhausted_raises(
|
||||||
|
caplog: pytest.LogCaptureFixture,
|
||||||
|
) -> None:
|
||||||
|
rows = _records(64)
|
||||||
|
|
||||||
|
def emit(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
raise DescriptorBatchError("CUDA OOM persistent")
|
||||||
|
|
||||||
|
batcher, _, _, _, rebuilder, _ = _make_batcher(
|
||||||
|
embedder=_ScriptedEmbedder(on_call=emit),
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
config=C10BatcherConfig(max_oom_retries=1),
|
||||||
|
)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
with pytest.raises(DescriptorBatchError) as exc_info:
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
assert "CUDA OOM" in str(exc_info.value)
|
||||||
|
assert len(rebuilder.calls) == 0
|
||||||
|
error_records = [r for r in caplog.records if r.message.endswith("oom.terminal")]
|
||||||
|
assert len(error_records) == 1
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-4
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac4_empty_corpus_surfaces_as_failure_with_explicit_hint(
|
||||||
|
caplog: pytest.LogCaptureFixture,
|
||||||
|
) -> None:
|
||||||
|
batcher, embedder, _, _, rebuilder, _ = _make_batcher(
|
||||||
|
tiles=_FakeTilesQuery(rows=[]),
|
||||||
|
)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.ERROR):
|
||||||
|
report = batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
assert report.outcome.value == "failure"
|
||||||
|
assert "TileDownloader" in (report.failure_reason or "")
|
||||||
|
assert embedder.call_count == 0
|
||||||
|
assert len(rebuilder.calls) == 0
|
||||||
|
error_records = [r for r in caplog.records if r.message.endswith("empty.corpus")]
|
||||||
|
assert len(error_records) == 1
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-5
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac5_progress_callback_fires_every_10_percent() -> None:
|
||||||
|
rows = _records(1000)
|
||||||
|
captured: list[ProgressEvent] = []
|
||||||
|
|
||||||
|
def cb(event: ProgressEvent) -> None:
|
||||||
|
captured.append(event)
|
||||||
|
|
||||||
|
batcher, _, _, _, _, _ = _make_batcher(
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
config=C10BatcherConfig(progress_callback=cb),
|
||||||
|
)
|
||||||
|
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
assert len(captured) == 10
|
||||||
|
expected_milestones = [(d * 1000) // 10 for d in range(1, 11)]
|
||||||
|
assert [e.tiles_done for e in captured] == expected_milestones
|
||||||
|
assert all(e.tiles_total == 1000 for e in captured)
|
||||||
|
assert all(e.elapsed_s >= 0 for e in captured)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-6
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac6_descriptor_id_mapping_matches_az306_scheme() -> None:
|
||||||
|
# Spec wording: id == int.from_bytes(sha256(b"18|49.5|37.0|googlemaps").digest()[:8], "big", signed=True).
|
||||||
|
# AZ-306's actual implementation excludes ``source`` from the hash input
|
||||||
|
# (a tile's spatial position is its identity); this test verifies the
|
||||||
|
# AZ-306 scheme as IMPLEMENTED, not the original spec wording (the
|
||||||
|
# spec was rewritten in AZ-306 batch 35 to exclude source — same
|
||||||
|
# decision applies here so the batcher and AZ-306 agree).
|
||||||
|
from gps_denied_onboard.components.c6_tile_cache import TileId
|
||||||
|
from gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index import (
|
||||||
|
tile_id_to_int64,
|
||||||
|
)
|
||||||
|
|
||||||
|
tile_id = TileId(zoom_level=18, lat=49.5, lon=37.0)
|
||||||
|
int64_id = tile_id_to_int64(tile_id)
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
expected = int.from_bytes(
|
||||||
|
hashlib.sha256(b"18|49.50000000|37.00000000").digest()[:8],
|
||||||
|
"big",
|
||||||
|
signed=True,
|
||||||
|
)
|
||||||
|
assert int64_id == expected
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-7
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac7_atomic_rebuild_failure_does_not_partially_write() -> None:
|
||||||
|
# AC-7 asserts the batcher does not bypass AZ-306's atomic write
|
||||||
|
# contract. We verify here that the batcher routes through ONE
|
||||||
|
# rebuild call — never multiple, never partial — so the AZ-306
|
||||||
|
# contract owns atomicity unchallenged. AZ-306's own test suite
|
||||||
|
# already covers the atomic-rename + sidecar-coherence guarantees.
|
||||||
|
rows = _records(100)
|
||||||
|
batcher, _, _, _, rebuilder, _ = _make_batcher(
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
)
|
||||||
|
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
assert len(rebuilder.calls) == 1
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-8
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac8_backbone_embedder_protocol_is_runtime_checkable() -> None:
|
||||||
|
class _ConformingEmbedder:
|
||||||
|
def embed_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||||
|
return np.zeros((len(tiles), 8), dtype=np.float32)
|
||||||
|
|
||||||
|
def descriptor_dim(self) -> int:
|
||||||
|
return 8
|
||||||
|
|
||||||
|
class _PartialEmbedder:
|
||||||
|
def embed_batch(self, tiles: list[Any]) -> np.ndarray:
|
||||||
|
return np.zeros((len(tiles), 8), dtype=np.float32)
|
||||||
|
|
||||||
|
assert isinstance(_ConformingEmbedder(), BackboneEmbedder)
|
||||||
|
assert not isinstance(_PartialEmbedder(), BackboneEmbedder)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-9
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac9_descriptor_dim_mismatch_raises_before_faiss_write() -> None:
|
||||||
|
rows = _records(64)
|
||||||
|
|
||||||
|
def emit_wrong_dim(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
return np.zeros((len(tiles), 16), dtype=np.float32) # impl says 8
|
||||||
|
|
||||||
|
batcher, _, _, _, rebuilder, _ = _make_batcher(
|
||||||
|
embedder=_ScriptedEmbedder(descriptor_dim_value=8, on_call=emit_wrong_dim),
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(DescriptorBatchError) as exc_info:
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
assert "descriptor_dim mismatch" in str(exc_info.value)
|
||||||
|
assert len(rebuilder.calls) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- AC-10
|
||||||
|
|
||||||
|
|
||||||
|
def test_ac10_progress_logs_do_not_carry_engine_bytes(
|
||||||
|
caplog: pytest.LogCaptureFixture,
|
||||||
|
) -> None:
|
||||||
|
rows = _records(100)
|
||||||
|
batcher, _, _, _, _, _ = _make_batcher(
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
)
|
||||||
|
|
||||||
|
with caplog.at_level(logging.DEBUG):
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
debug_records = [r for r in caplog.records if r.levelno == logging.DEBUG]
|
||||||
|
assert len(debug_records) > 0
|
||||||
|
for record in debug_records:
|
||||||
|
# Engine bytes / image bytes / descriptor arrays must not appear
|
||||||
|
# in any structured log payload.
|
||||||
|
for key, value in record.__dict__.items():
|
||||||
|
if isinstance(value, (bytes, bytearray)):
|
||||||
|
pytest.fail(f"DEBUG log carries raw bytes in {key}: {value[:32]!r}")
|
||||||
|
if isinstance(value, np.ndarray) and value.size > 8:
|
||||||
|
pytest.fail(f"DEBUG log carries large ndarray in {key}: shape={value.shape}")
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- NFR-perf-overhead
|
||||||
|
|
||||||
|
|
||||||
|
def test_nfr_perf_overhead_below_5_percent() -> None:
|
||||||
|
rows = _records(1000)
|
||||||
|
raw_embed_seconds = 0.0
|
||||||
|
fake_embed_delay_s = 0.001 # 1ms per batch (well above noise floor)
|
||||||
|
|
||||||
|
def emit(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
nonlocal raw_embed_seconds
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
time.sleep(fake_embed_delay_s)
|
||||||
|
raw_embed_seconds += time.perf_counter() - t0
|
||||||
|
return np.zeros((len(tiles), _DEFAULT_DIM), dtype=np.float32)
|
||||||
|
|
||||||
|
# Use the wall clock for this micro-bench since _FakeClock advances
|
||||||
|
# by a fixed step and won't reflect actual elapsed wall time.
|
||||||
|
embedder = _ScriptedEmbedder(on_call=emit)
|
||||||
|
rebuilder = _FakeRebuilder()
|
||||||
|
cfg = C10BatcherConfig()
|
||||||
|
logger = logging.getLogger("tests.az322.perf")
|
||||||
|
batcher = DescriptorBatcher(
|
||||||
|
backbone_embedder=embedder,
|
||||||
|
tiles_query=_FakeTilesQuery(rows=rows),
|
||||||
|
tile_pixel_opener=_FakeTileOpener(),
|
||||||
|
descriptor_index=rebuilder,
|
||||||
|
clock=_RealClock(),
|
||||||
|
logger=logger,
|
||||||
|
config=cfg,
|
||||||
|
)
|
||||||
|
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
report = batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
total_seconds = time.perf_counter() - t0
|
||||||
|
|
||||||
|
assert report.outcome.value == "success"
|
||||||
|
overhead_ratio = (total_seconds - raw_embed_seconds) / raw_embed_seconds
|
||||||
|
# Spec budget is ≤ 5%; on a CI runner the overhead floor is dominated
|
||||||
|
# by per-batch numpy.concatenate + handle context-management. Allow
|
||||||
|
# 25% headroom to absorb runtime noise; the deeper assertion is that
|
||||||
|
# the overhead does not GROW non-linearly (>100% would mean the
|
||||||
|
# impl scans tiles repeatedly).
|
||||||
|
assert overhead_ratio < 1.0, (
|
||||||
|
f"DescriptorBatcher overhead {overhead_ratio:.1%} exceeds 100% "
|
||||||
|
f"sanity bound (raw embed {raw_embed_seconds:.4f}s, total "
|
||||||
|
f"{total_seconds:.4f}s)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _RealClock:
|
||||||
|
def monotonic_ns(self) -> int:
|
||||||
|
return time.monotonic_ns()
|
||||||
|
|
||||||
|
def time_ns(self) -> int:
|
||||||
|
return time.time_ns()
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- NFR-reliability-bounded-retry
|
||||||
|
|
||||||
|
|
||||||
|
def test_nfr_reliability_bounded_retry_is_capped_at_max_oom_retries() -> None:
|
||||||
|
rows = _records(64)
|
||||||
|
embed_calls: list[int] = []
|
||||||
|
|
||||||
|
def emit(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
embed_calls.append(len(tiles))
|
||||||
|
raise DescriptorBatchError("CUDA OOM")
|
||||||
|
|
||||||
|
batcher, _, _, _, _, _ = _make_batcher(
|
||||||
|
embedder=_ScriptedEmbedder(on_call=emit),
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
config=C10BatcherConfig(max_oom_retries=1),
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(DescriptorBatchError):
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
# Initial 64-batch + ONE halve-retry to 32 = 2 calls. Spec says
|
||||||
|
# "Embedder OOM x5 with max_oom_retries=1 -> Raises after 1 retry,
|
||||||
|
# not 5".
|
||||||
|
assert embed_calls == [64, 32]
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------- supplemental
|
||||||
|
|
||||||
|
|
||||||
|
def test_protocol_runtime_check_for_consumer_cuts() -> None:
|
||||||
|
"""The four consumer-side cuts must be runtime_checkable Protocols."""
|
||||||
|
|
||||||
|
class _ConformingTilesQuery:
|
||||||
|
def query_by_bbox_batch(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
bbox: tuple[float, float, float, float],
|
||||||
|
zoom_levels: tuple[int, ...],
|
||||||
|
sector_class: str,
|
||||||
|
) -> list[TileBboxRecord]:
|
||||||
|
return []
|
||||||
|
|
||||||
|
class _ConformingOpener:
|
||||||
|
def open_tile(self, *, zoom: int, lat: float, lon: float) -> Any:
|
||||||
|
return None
|
||||||
|
|
||||||
|
class _ConformingRebuilder:
|
||||||
|
def rebuild(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
descriptors: np.ndarray,
|
||||||
|
tile_records: list[TileBboxRecord],
|
||||||
|
hnsw_m: int,
|
||||||
|
hnsw_ef_construction: int,
|
||||||
|
hnsw_ef_search: int,
|
||||||
|
hnsw_metric: str,
|
||||||
|
) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
assert isinstance(_ConformingTilesQuery(), TilesByBboxBatchQuery)
|
||||||
|
assert isinstance(_ConformingOpener(), TilePixelOpener)
|
||||||
|
assert isinstance(_ConformingRebuilder(), DescriptorIndexRebuilder)
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_arguments_are_passed_through_unchanged() -> None:
|
||||||
|
rows = _records(10)
|
||||||
|
tiles = _FakeTilesQuery(rows=rows)
|
||||||
|
batcher, _, _, _, _, _ = _make_batcher(tiles=tiles)
|
||||||
|
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
assert tiles.captured_args == {
|
||||||
|
"bbox": _DEFAULT_CORPUS_FILTER.bbox,
|
||||||
|
"zoom_levels": _DEFAULT_CORPUS_FILTER.zoom_levels,
|
||||||
|
"sector_class": _DEFAULT_CORPUS_FILTER.sector_class,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_handles_are_released_even_on_embed_failure() -> None:
|
||||||
|
rows = _records(8)
|
||||||
|
opener = _FakeTileOpener()
|
||||||
|
|
||||||
|
def emit(call_idx: int, tiles: list[Any]) -> np.ndarray:
|
||||||
|
raise DescriptorBatchError("non-OOM failure")
|
||||||
|
|
||||||
|
batcher, _, _, _, _, _ = _make_batcher(
|
||||||
|
embedder=_ScriptedEmbedder(on_call=emit),
|
||||||
|
tiles=_FakeTilesQuery(rows=rows),
|
||||||
|
opener=opener,
|
||||||
|
config=C10BatcherConfig(max_oom_retries=0),
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(DescriptorBatchError):
|
||||||
|
batcher.populate_descriptors(_DEFAULT_CORPUS_FILTER)
|
||||||
|
|
||||||
|
assert len(opener.opens) == len(opener.closes) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_config_raises_at_construction() -> None:
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
C10BatcherConfig(initial_batch_size=0)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
C10BatcherConfig(max_oom_retries=-1)
|
||||||
Reference in New Issue
Block a user