From 3b7265757b198eb87e03f2e22805322e528fbf92 Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Wed, 13 May 2026 04:01:37 +0300 Subject: [PATCH] [AZ-306] C6 FaissDescriptorIndex (faiss-cpu, HNSW32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production-default DescriptorIndex strategy backed by the faiss-cpu PyPI wheel (>=1.7,<2.0). Implements the AZ-303 Protocol surface end to end: HNSW32 + IndexIDMap2 search, atomic three-file rebuild (.index + .sha256 sidecar + .meta.json), triple-consistency load check, mmap-backed reads with IO_FLAG_MMAP|IO_FLAG_READ_ONLY, optional warm-up query at construction, FAISS RuntimeError rewrap to IndexUnavailableError / IndexBuildError, and FaissDescriptorIndex.from_config classmethod wired into runtime_root.storage_factory. The original spec required a custom pybind11 wrapper over a vendored FAISS HEAD; the user opted for the upstream faiss-cpu wheel after research fact #92 confirmed ARM64 wheel availability for Jetson and the existing pyproject.toml already pinned faiss-cpu. cpp/faiss_index/ placeholder removed; BUILD_FAISS_INDEX flag retained as a runtime/factory gate (no native target). Spec rewritten end-to-end and archived to _docs/02_tasks/done/. C6TileCacheConfig extended with faiss_index_path and faiss_warmup_query_path fields. tests/conftest.py sets KMP_DUPLICATE_LIB_OK=TRUE to remediate the macOS faiss/torch libomp duplicate-load abort during pytest (no-op on CI Linux). 21 new tests cover AC-1..12 + 2 NFRs + from_config smoke; AZ-303 protocol-conformance fake updated with from_config classmethod. Tests: 124/124 c6_tile_cache pass; 1334 project-wide pass; 1 pre-existing OKVIS2 submodule failure unrelated. Doc sync: module-layout.md, components/08_c6_tile_cache/description.md §5, batch_35_cycle1_report.md. 
Co-authored-by: Cursor --- .../08_c6_tile_cache/description.md | 2 +- _docs/02_document/module-layout.md | 6 +- .../AZ-306_c6_faiss_descriptor_index.md | 135 ++-- .../batch_35_cycle1_report.md | 175 +++++ _docs/_autodev_state.md | 4 +- cmake/build_options.cmake | 2 +- cpp/CMakeLists.txt | 8 +- cpp/faiss_index/CMakeLists.txt | 4 - pyproject.toml | 14 +- .../c6_tile_cache/_native/__init__.py | 1 - .../components/c6_tile_cache/config.py | 17 + .../c6_tile_cache/faiss_descriptor_index.py | 597 ++++++++++++++++ .../postgres_filesystem_store.py | 2 +- .../runtime_root/storage_factory.py | 2 +- tests/conftest.py | 11 + .../test_faiss_descriptor_index.py | 650 ++++++++++++++++++ .../test_protocol_conformance.py | 7 + 17 files changed, 1550 insertions(+), 87 deletions(-) rename _docs/02_tasks/{todo => done}/AZ-306_c6_faiss_descriptor_index.md (50%) create mode 100644 _docs/03_implementation/batch_35_cycle1_report.md delete mode 100644 cpp/faiss_index/CMakeLists.txt delete mode 100644 src/gps_denied_onboard/components/c6_tile_cache/_native/__init__.py create mode 100644 src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py create mode 100644 tests/unit/c6_tile_cache/test_faiss_descriptor_index.py diff --git a/_docs/02_document/components/08_c6_tile_cache/description.md b/_docs/02_document/components/08_c6_tile_cache/description.md index 3c2af75..ea10798 100644 --- a/_docs/02_document/components/08_c6_tile_cache/description.md +++ b/_docs/02_document/components/08_c6_tile_cache/description.md @@ -119,7 +119,7 @@ Not applicable — internal-only; C11 `TileUploader` reads via `TileStore` for u |---------|---------|---------| | PostgreSQL (server + libpq) | 16.x (mirror of `satellite-provider`'s pin) | Spatial metadata index | | psycopg / asyncpg | per project pin | Python Postgres client | -| FAISS (Python + C++) | upstream HEAD pinned per Plan-phase | HNSW retrieval | +| faiss-cpu (PyPI wheel) | `>=1.7,<2.0` | HNSW retrieval (AZ-306 chose the upstream wheel 
over a custom pybind11 wrapper; runtime-gated by `BUILD_FAISS_INDEX` at the storage factory) | | atomicwrites | latest | Atomic file replacement for `.index` rebuild (D-C10-3) | | hashlib (stdlib) | stdlib | SHA-256 content-hash sidecars | diff --git a/_docs/02_document/module-layout.md b/_docs/02_document/module-layout.md index 5490a82..1f4168b 100644 --- a/_docs/02_document/module-layout.md +++ b/_docs/02_document/module-layout.md @@ -146,7 +146,7 @@ Bootstrap reference: `_docs/02_tasks/todo/AZ-263_initial_structure.md`. Architec - Config block: `C6TileCacheConfig` (registered on import) - Error family rooted at `TileCacheError` with documented subtypes (`TileNotFoundError`, `TileFsError`, `TileMetadataError`, `ContentHashMismatchError`, `FreshnessRejectionError`, `CacheBudgetExhaustedError`, `IndexUnavailableError`) + sibling `IndexBuildError` (offline build envelope, not in the `TileCacheError` family) - **Internal**: - - `_native/` (FAISS C++ wrapper, planned) + - `faiss_descriptor_index.py` (AZ-306 — production-default `DescriptorIndex` strategy backed by the `faiss-cpu` PyPI wheel; HNSW32 search + atomic rebuild + triple-sidecar coherence + warm-up; `from_config` classmethod consumed by `runtime_root.storage_factory.build_descriptor_index`. Gated by `BUILD_FAISS_INDEX` at the factory boundary, NOT at module import.) - `_tile_pixel_handle.py` (`TilePixelHandle` ABC) - `_types.py` (DTOs / enums; consumed via the Public API re-exports) - `_uuid_namespace.py` (AZ-304 — pinned `TILE_NAMESPACE_UUID` + `derive_tile_id` / `derive_location_hash` helpers; cross-repo coordinated with `satellite-provider`) @@ -156,7 +156,7 @@ Bootstrap reference: `_docs/02_tasks/todo/AZ-263_initial_structure.md`. 
Architec - `cache_budget_enforcer.py` (AZ-308 — RESTRICT-SAT-2 10 GiB hard cap; `CacheBudgetEnforcer.reserve_headroom` + `BudgetEnforcedTileStore` write-decorator) - `tools.py` (AZ-305 — operator dump CLI invoked via `python -m gps_denied_onboard.components.c6_tile_cache.tools ...`) - `errors.py`, `config.py` (component plumbing) -- **Owns**: `src/gps_denied_onboard/components/c6_tile_cache/**`, `cpp/faiss_index/**`, `tests/unit/c6_tile_cache/**`, `db/migrations/**` (project-level Alembic env owned by c6 — `alembic.ini` at repo root points here; `0001_initial.py` shipped by AZ-263 bootstrap, `0002_c6_tile_identity_and_lru.py` and forward owned by AZ-304+ migrations) +- **Owns**: `src/gps_denied_onboard/components/c6_tile_cache/**`, `tests/unit/c6_tile_cache/**`, `db/migrations/**` (project-level Alembic env owned by c6 — `alembic.ini` at repo root points here; `0001_initial.py` shipped by AZ-263 bootstrap, `0002_c6_tile_identity_and_lru.py` and forward owned by AZ-304+ migrations). AZ-306 retired the `cpp/faiss_index/` placeholder in favour of the `faiss-cpu` PyPI wheel; the `BUILD_FAISS_INDEX` flag is preserved as a runtime/factory gate (consumed by `runtime_root.storage_factory`). 
- **Imports from**: `_types`, `helpers.sha256_sidecar`, `helpers.wgs_converter`, `clock`, `config`, `logging`, `fdr_client` - **Consumed by**: `c2_vpr`, `c2_5_rerank`, `c3_matcher`, `c10_provisioning`, `c11_tile_manager`, `runtime_root` @@ -427,7 +427,7 @@ Four binaries are built from this codebase: **airborne** (Tier-1 + Tier-2 produc | `BUILD_C11_TILE_MANAGER` | c11_tile_manager | OFF | OFF | ON | OFF | | `BUILD_C12_OPERATOR_TOOLING` | c12_operator_tooling | OFF | OFF | ON | OFF | | `BUILD_GTSAM_BINDINGS` | cpp/gtsam_bindings (used by c4_pose + c5_state) | ON | ON | OFF | ON | -| `BUILD_FAISS_INDEX` | cpp/faiss_index (used by c6_tile_cache) | ON | ON | ON | OFF (replay reads pre-built cache only) | +| `BUILD_FAISS_INDEX` | c6_tile_cache `FaissDescriptorIndex` (faiss-cpu wheel; runtime gate at `runtime_root.storage_factory` — no native target) | ON | ON | ON | OFF (replay reads pre-built cache only) | | `BUILD_VIDEO_FILE_FRAME_SOURCE` | `frame_source/VideoFileFrameSource` (AZ-265) | OFF | OFF | OFF | ON | | `BUILD_TLOG_REPLAY_ADAPTER` | `c8_fc_adapter/tlog_replay_adapter` (AZ-265) | OFF | OFF | OFF | ON | | `BUILD_REPLAY_SINK_JSONL` | `c8_fc_adapter/replay_sink` (AZ-265) | OFF | OFF | OFF | ON | diff --git a/_docs/02_tasks/todo/AZ-306_c6_faiss_descriptor_index.md b/_docs/02_tasks/done/AZ-306_c6_faiss_descriptor_index.md similarity index 50% rename from _docs/02_tasks/todo/AZ-306_c6_faiss_descriptor_index.md rename to _docs/02_tasks/done/AZ-306_c6_faiss_descriptor_index.md index aaf5c09..6e815fc 100644 --- a/_docs/02_tasks/todo/AZ-306_c6_faiss_descriptor_index.md +++ b/_docs/02_tasks/done/AZ-306_c6_faiss_descriptor_index.md @@ -1,8 +1,8 @@ -# C6 FaissDescriptorIndex — HNSW Search + Atomic Rebuild + pybind11 Wrapper +# C6 FaissDescriptorIndex — HNSW Search + Atomic Rebuild (faiss-cpu) **Task**: AZ-306_c6_faiss_descriptor_index **Name**: C6 FaissDescriptorIndex -**Description**: Implement `FaissDescriptorIndex`, the production-default `DescriptorIndex` Protocol 
strategy. Owns the F1 pre-flight `rebuild_from_descriptors` path (atomic `.index` file write + sidecar via AZ-280), the F2 takeoff load (mmap with `IO_FLAG_MMAP_IFC`), the F3 hot-path `search_topk` (HNSW; ≤ 5 ms p95 warm; sole consumer is C2 VPR), the `index_metadata` sidecar block, and the `cpp/faiss_index/` pybind11 wrapper that links FAISS HEAD-pinned per Plan-phase under the `BUILD_FAISS_INDEX` flag. +**Description**: Implement `FaissDescriptorIndex`, the production-default `DescriptorIndex` Protocol strategy. Owns the F1 pre-flight `rebuild_from_descriptors` path (atomic `.index` file write + AZ-280 SHA-256 sidecar + `.meta.json` sidecar), the F2 takeoff load (FAISS mmap-backed read via `faiss.read_index(path, IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`), the F3 hot-path `search_topk` (HNSW; ≤ 5 ms p95 warm; sole runtime consumer is C2 VPR), and the `index_metadata` sidecar block. Implementation uses the upstream `faiss-cpu` PyPI wheel (research fact #92 / arch tech-stack pin) — `faiss.IndexHNSWFlat` wrapped in `faiss.IndexIDMap2` for `(TileId, int64)` mapping; no project-side C++ vendor or pybind11 wrapper. **Complexity**: 5 points **Dependencies**: AZ-303_c6_storage_interfaces, AZ-280_sha256_sidecar, AZ-263_initial_structure, AZ-269_config_loader, AZ-266_log_module **Component**: c6_tile_cache (epic AZ-250 / E-C6) @@ -23,61 +23,66 @@ Without a real `FaissDescriptorIndex`: - C2 VPR has no production retrieval path — `search_topk` is a hole; F3 hot path fails before C2.5. - C10 CacheProvisioner has no production index builder — F1 pre-flight cannot persist a `.index` file; takeoff blocks. - The F2 takeoff cold-start budget (AC-NEW-1 ≤ 12 s end-to-end) cannot be measured — without a real warm-up query, the first per-frame `search_topk` would pay the multi-second mmap page-in (description.md § 7). -- The `IndexUnavailableError` raise points (mismatched sidecar, dimension mismatch, mmap'd file replaced concurrently) are unenforced — silent corruption is possible. 
-- The `BUILD_FAISS_INDEX=OFF` Tier-0 dev path has no test surface — build matrix coverage is missing. +- The `IndexUnavailableError` raise points (mismatched sidecar, dimension mismatch, missing/corrupt meta) are unenforced — silent corruption is possible. +- The `BUILD_FAISS_INDEX=OFF` factory gate has no concrete impl to negative-test against beyond the AZ-303 fake. -This task is the production-default impl. The Protocol, contract, and the pinned FAISS dependency are now ready; this is the integration point. +This task is the production-default impl. The Protocol, contract, and the FAISS PyPI dependency (`faiss-cpu>=1.7,<2.0`) are now ready; this is the integration point. ## Outcome - A `FaissDescriptorIndex` class at `src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py` conforming to the `DescriptorIndex` Protocol from AZ-303. -- A pybind11 wrapper at `cpp/faiss_index/` (CMake `BUILD_FAISS_INDEX` flag) that exposes only the methods this task needs: `read_index_mmap(path)`, `write_index(path, index_handle)`, `add_with_ids(index, vectors, ids)`, `search(index, query, k)`, `set_ef_search(index, ef)`. The wrapper holds NO Python state — all state is in the Python `FaissDescriptorIndex` class. -- Constructor signature: `__init__(self, *, index_path: Path, sha256_sidecar: Sha256SidecarHelper, logger: Logger, warmup_query: Optional[np.ndarray] = None)`. The composition root wires the dependencies; the warm-up query is loaded from `config.tile_cache.faiss_warmup_query` at startup if present. +- Pure-Python implementation built on the upstream `faiss-cpu` PyPI wheel (`faiss.IndexHNSWFlat` + `faiss.IndexIDMap2` + `faiss.write_index` + `faiss.read_index(IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`). No project-side C++ vendor, no custom pybind11 wrapper. Research fact #92 + architecture.md tech-stack pin both call out this exact dependency choice. 
+- Constructor signature: `__init__(self, *, index_path: Path, sidecar: Sha256Sidecar, logger: Logger, warmup_query: np.ndarray | None = None)` — keyword-only. The composition-root convenience entry-point is `FaissDescriptorIndex.from_config(config)` (mirrors AZ-305 / `PostgresFilesystemStore.from_config`); it reads `config.components['c6_tile_cache'].faiss_index_path` + the optional `faiss_warmup_query_path` (NPY file), wires the static `Sha256Sidecar` facade and the project logger, and returns the constructed instance. - `search_topk(query, k) -> list[tuple[TileId, float]]`: 1. Validates `query.shape == (descriptor_dim,)`, `query.dtype == np.float32`, `query.flags.c_contiguous`. Mismatch → `IndexUnavailableError` (per `descriptor_index.md` § I-3). - 2. Calls `cpp/faiss_index/search` with `k`. - 3. Maps the returned int64 ids back to `TileId` via the in-memory `_id_to_tile_id` map (built at load time from the sidecar metadata block). - 4. Returns up to `k` `(TileId, float)` pairs ordered by ascending distance. Fewer-than-k results are tolerated per § I-2. + 2. Calls `self._index.search(query.reshape(1, -1), k)` and receives `(D, I)`. + 3. Maps every non-`-1` int64 id back to `TileId` via the in-memory `_id_to_tile_id` map (built at load time from the `.meta.json` sidecar) — a missing id is a corruption signal → `IndexUnavailableError`. + 4. Returns up to `k` `(TileId, float)` pairs ordered by ascending distance. Fewer-than-k results are tolerated per § I-2 (the `-1` sentinel is filtered out). - `descriptor_dim() -> int`: returns the cached `IndexMetadata.descriptor_dim` from load time. Constant-time. -- `mmap_handle() -> Path`: returns the `index_path` constructor arg. Raises `IndexUnavailableError` if the index is not currently loaded (e.g., construction failed and the operator caught the exception). +- `mmap_handle() -> Path`: returns the `index_path` constructor arg. 
Raises `IndexUnavailableError` if the index is not currently loaded (e.g., construction was attempted but the loader raised and the caller swallowed it). - `rebuild_from_descriptors(descriptors, tile_ids, hnsw_params) -> None`: - 1. Validates `descriptors.shape == (len(tile_ids), descriptor_dim)`, `descriptors.dtype == np.float32`, `descriptors.flags.c_contiguous`, `len(tile_ids) > 0`. Mismatch → `IndexBuildError`. - 2. Builds the HNSW index in C++ via `cpp/faiss_index/add_with_ids` with the supplied params. - 3. Serialises to a temp path under `index_path.parent` via `cpp/faiss_index/write_index`. - 4. Writes the sidecar metadata block (a separate `.meta.json` file carrying `IndexMetadata` JSON: `descriptor_dim`, `n_vectors`, `backbone_label`, `backbone_sha256_hex`, `built_at`, `hnsw_params`, plus the `tile_id` ↔ int64 mapping). - 5. Runs `sha256_sidecar.atomic_write_with_sidecar(index_path, temp_index_bytes)` for atomic rename of the `.index` file + `.sha256` sidecar. - 6. Reloads the in-memory index from the new file (so subsequent `search_topk` calls hit the fresh data). - 7. Emits an INFO log on success: `kind="c6.faiss.rebuilt"` with `n_vectors`, `descriptor_dim`, elapsed seconds. -- `index_metadata() -> IndexMetadata`: parses the `.meta.json` sidecar; raises `IndexUnavailableError` if missing or corrupt. + 1. Validates `descriptors.shape == (len(tile_ids), descriptor_dim)` (when an existing dim is loaded, OR derives `descriptor_dim` from `descriptors.shape[1]` on first build), `descriptors.dtype == np.float32`, `descriptors.flags.c_contiguous`, `len(tile_ids) > 0`. Mismatch → `IndexBuildError`. + 2. Computes deterministic int64 ids: `_int64_id_for_tile(tile_id) = int.from_bytes(sha256(f"{zoom_level}|{lat:.8f}|{lon:.8f}").digest()[:8], "big", signed=True)`. Detects collisions across the input `tile_ids` and raises `IndexBuildError` naming both colliding ids on collision. + 3. 
Builds `IndexIDMap2(IndexHNSWFlat(d, hnsw_params.m, metric))`, where `metric` is exactly one of `METRIC_L2` or `METRIC_INNER_PRODUCT` (an either/or choice, not a bitwise OR), with `efConstruction = hnsw_params.ef_construction`, then `add_with_ids(descriptors, ids)`. Sets `efSearch = hnsw_params.ef_search` on the inner HNSW. + 4. Serialises to a sibling temp path via `faiss.write_index(idx, temp_path)`. + 5. Computes `sha256(temp_path bytes)`; composes the `IndexMetadata` JSON (`descriptor_dim`, `n_vectors`, `backbone_label`, `backbone_sha256_hex`, `built_at`, `hnsw_params`, `sidecar_sha256_hex`, `tile_id_to_int64_mapping`). + 6. Atomic-writes `<index_path>` + `.sha256` via `Sha256Sidecar.write_atomic_and_sidecar(index_path, temp_index_bytes)`; then atomic-writes `.meta.json` via `Sha256Sidecar.write_atomic` (no nested `.sha256` for the meta file). + 7. Removes the local temp file (best-effort; logged on failure). + 8. Reloads the in-memory index by re-running the load flow (so subsequent `search_topk` calls hit the fresh data). + 9. Emits an INFO log on success: `kind="c6.faiss.rebuilt"` with `n_vectors`, `descriptor_dim`, elapsed seconds. -- `index_metadata() -> IndexMetadata`: parses the `.meta.json` sidecar; raises `IndexUnavailableError` if missing or corrupt. +- `index_metadata() -> IndexMetadata`: returns the cached `IndexMetadata` populated at load time; raises `IndexUnavailableError` if the index is not currently loaded. - Load flow at construction: - 1. Validates `index_path` exists; if missing, raises `IndexUnavailableError` (composition root catches and decides — Tier-0 dev may proceed with thermal-aware paths disabled, similar to AZ-302's pattern). + 1. Validates `index_path` exists; if missing, raises `IndexUnavailableError` (composition root catches and decides — Tier-0 dev may proceed with descriptor-index-dependent paths disabled, similar to AZ-302's pattern). 2. Reads `.sha256` and validates it matches `sha256(<index_path>)`; mismatch → `IndexUnavailableError`. - 3. Reads `.meta.json` and validates it parses to `IndexMetadata`; corruption → `IndexUnavailableError`. - 4. Calls `cpp/faiss_index/read_index_mmap(index_path)` with `IO_FLAG_MMAP_IFC` (FAISS's mmap-backed read path). - 5. 
Caches `descriptor_dim`, `n_vectors`, the `_id_to_tile_id` map, and the FAISS index handle. - 6. If `warmup_query` is supplied, runs ONE `search_topk(warmup_query, k=1)` to page in the mmap'd file. -- `cpp/faiss_index/` is a thin pybind11 module — no Python-level state, no GIL holds beyond what FAISS itself does. The build is gated by CMake `BUILD_FAISS_INDEX=ON`; with the flag off, the Python `FaissDescriptorIndex` class is not even importable (the `from cpp_faiss_index import ...` line at module top fails import-time, exactly as `BUILD_TENSORRT_RUNTIME=OFF` makes `tensorrt_runtime.py` unimportable). -- All third-party FAISS exceptions (C++ exceptions surfaced via pybind11 as `RuntimeError`) are caught and rewrapped into `IndexUnavailableError` (read path) or `IndexBuildError` (rebuild path). + 3. Reads `.meta.json` and validates it parses to `IndexMetadata` AND that `meta.sidecar_sha256_hex == sha256(<index_path>)` (triple-consistency check); any mismatch → `IndexUnavailableError`. + 4. Calls `faiss.read_index(<index_path>, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)`. Caches the FAISS handle. + 5. Caches `descriptor_dim`, `n_vectors`, the `_id_to_tile_id` map, and the parsed `IndexMetadata`. + 6. Sets `efSearch = meta.hnsw_params.ef_search` on the inner HNSW. + 7. If `warmup_query` is supplied, runs ONE `search_topk(warmup_query, k=1)` to page in the mmap'd file. +- `BUILD_FAISS_INDEX=OFF` semantics live at the **composition-root factory** (`runtime_root.storage_factory.build_descriptor_index`, AZ-303) — the factory checks the env flag BEFORE attempting the import; with the flag OFF, `RuntimeNotAvailableError` is raised and `faiss_descriptor_index` is never loaded into `sys.modules`. The module itself is import-time-clean (no side effects) — gating is at the factory boundary, not inside the impl module. This matches the established AZ-303 contract and keeps Tier-0 dev environments without `faiss-cpu` blocked from accidentally constructing the impl. 
+- All third-party FAISS exceptions (`RuntimeError` from the SWIG wrapper, plus `OSError` from the underlying file ops) are caught and rewrapped into `IndexUnavailableError` (read path) or `IndexBuildError` (rebuild path) with the original via `__cause__`. ## Scope ### Included - `FaissDescriptorIndex` class implementation conforming to AZ-303's Protocol. -- `cpp/faiss_index/` pybind11 module with the five-method surface above. - The `.meta.json` sidecar format — a JSON document carrying `IndexMetadata` plus the `tile_id` ↔ int64 mapping. -- The HNSW int64-id assignment scheme: a stable, deterministic mapping from `TileId` (composite tuple) to int64 id at rebuild time. The mapping function is `int64(sha256(zoom|lat|lon|source).first8bytes)` — collisions are detected at rebuild time (rebuild raises `IndexBuildError` on collision). -- Construction-time mmap of the existing `.index` file (or `IndexUnavailableError` if absent / corrupted). +- The HNSW int64-id assignment scheme: a stable, deterministic mapping from `TileId` (composite tuple) to int64 id at rebuild time. Mapping function: `int.from_bytes(sha256(f"{zoom_level}|{lat:.8f}|{lon:.8f}").digest()[:8], "big", signed=True)`. Collisions are detected at rebuild time (rebuild raises `IndexBuildError` naming both colliding tile_ids on collision). +- Construction-time `faiss.read_index(path, IO_FLAG_MMAP | IO_FLAG_READ_ONLY)` of the existing `.index` file (or `IndexUnavailableError` if absent / corrupted / sidecar-mismatched). +- Triple-consistency load check: `sha256(.index) == .sha256.content == .meta.json::sidecar_sha256_hex`. - Optional construction-time warm-up query (no warm-up if `warmup_query=None`). -- Lazy-import gating: the `cpp_faiss_index` import lives at module top, so `BUILD_FAISS_INDEX=OFF` makes the module unimportable. The composition-root factory's `if BUILD_FAISS_INDEX:` guard prevents the import attempt under the OFF flag. 
+- Composition-root convenience entry-point `FaissDescriptorIndex.from_config(config)` mirroring AZ-305's `PostgresFilesystemStore.from_config`. +- BUILD_FAISS_INDEX gate enforced at the AZ-303 factory (`build_descriptor_index`), not at module import time. - Diagnostic INFO log on construction with `n_vectors`, `descriptor_dim`, sidecar SHA-256, build timestamp; INFO on `rebuild_from_descriptors` start + end with elapsed seconds. -- Standalone CLI `python -m c6_tile_cache.faiss_descriptor_index inspect ` for operator post-flight inspection (prints `IndexMetadata` + the first 5 vectors' ids). ### Excluded - The C10 CacheProvisioner orchestration that calls `rebuild_from_descriptors` — owned by E-C10. This task exposes the API; C10 calls it. - The C2 VPR consumer wiring of `search_topk` — owned by E-C2. -- A second `DescriptorIndex` impl (e.g., `FlatDescriptorIndex` for unit tests that don't want HNSW overhead) — out of scope this cycle. Tests use a fake satisfying the Protocol. +- A second `DescriptorIndex` impl (e.g., `FlatDescriptorIndex` for unit tests that don't want HNSW overhead) — out of scope this cycle. Tests use a fake satisfying the Protocol where Protocol-only behaviour is tested; tests that exercise FAISS itself use the real `faiss-cpu` package. +- A custom pybind11 wrapper at `cpp/faiss_index/` — superseded by the `faiss-cpu` PyPI dep choice (research fact #92 + arch tech-stack pin); the placeholder directory is removed in this task and the `BUILD_FAISS_INDEX` flag stays as the factory-level gate only. +- A standalone operator inspect CLI — deferred (operators can use `faiss.read_index(...)` + the public `index_metadata()` accessor directly; a richer CLI is out of cycle). - GPU FAISS variants — explicitly forbidden by AZ-303 § I-4. - Incremental updates / online learning — F1 pre-flight is full-rebuild only per `descriptor_index.md` Non-Goals. - Descriptor compression / PQ quantisation — out of scope this cycle (HNSW32 raw float32). 
@@ -131,10 +136,10 @@ Given a 100k-vector corpus, page cache warm When `search_topk` is called 1000 times with random queries Then p95 ≤ 5 ms (failure threshold 50 ms — but this is a sanity bound, NOT the C2 budget; the canonical C2-PT-01 measurement is in C2's test phase) -**AC-10: BUILD_FAISS_INDEX=OFF makes the module unimportable** -Given a build with `BUILD_FAISS_INDEX=OFF` (the `cpp_faiss_index` shared lib is not built) -When `from gps_denied_onboard.components.c6_tile_cache import faiss_descriptor_index` is attempted -Then `ImportError` is raised at the `from cpp_faiss_index import ...` line; the composition-root factory's `if BUILD_FAISS_INDEX:` guard MUST prevent the import attempt. The factory raises `RuntimeNotAvailableError` instead. +**AC-10: BUILD_FAISS_INDEX=OFF blocks construction at the factory boundary** +Given an environment where `BUILD_FAISS_INDEX` is unset OR `OFF`/`0`/`false` +When `runtime_root.storage_factory.build_descriptor_index(config)` is called with `descriptor_index_runtime="faiss_hnsw"` +Then `RuntimeNotAvailableError` is raised naming `"faiss_hnsw"`; `gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index` is NOT present in `sys.modules` (factory checks the env flag BEFORE the import). This AC is already covered by AZ-303's `test_ac5_build_descriptor_index_flag_off_raises_no_import` and is RE-VERIFIED here against the real impl module to confirm the impl module did not introduce eager imports that defeat the gate. **AC-11: int64-id collision detection at rebuild** Given two `tile_ids` whose deterministic int64 mapping collides (synthetic test using a hash-seed mock) @@ -154,15 +159,14 @@ Then the returned `IndexMetadata` matches every field; `sidecar_sha256_hex` matc - `rebuild_from_descriptors` is bound by FAISS HNSW build time — minutes for 100k vectors. NOT a hot-path operation; F1 pre-flight only. **Compatibility** -- FAISS HEAD pinned per Plan-phase (description.md § 5). No version negotiation. 
-- pybind11 stable ABI as already pinned by AZ-263 bootstrap. +- `faiss-cpu>=1.7,<2.0` PyPI dep, ARM64 + x86_64 wheels published; numpy 1.x compatible (matches project numpy pin per leftover D-CROSS-CVE-1). - numpy float32 C-contiguous arrays only on the search surface. **Reliability** -- All FAISS C++ exceptions are caught and rewrapped into `IndexUnavailableError` / `IndexBuildError`. +- All `faiss-cpu` `RuntimeError` exceptions are caught and rewrapped into `IndexUnavailableError` (read path) / `IndexBuildError` (rebuild path) with the original via `__cause__`. - The mmap'd file lifetime is bound to the `FaissDescriptorIndex` instance lifetime; the composition root holds the singleton for the flight. -- `rebuild_from_descriptors` is atomic — partial failure preserves the prior index. -- `.index` is never modified in place — always written to a temp path then atomically renamed. +- `rebuild_from_descriptors` is atomic — partial failure preserves the prior index. Triple-consistency load gate fails closed on any `.index` / `.sha256` / `.meta.json` mismatch. +- `.index` is never modified in place — always written to a temp path then atomically renamed via the AZ-280 sidecar helper. **Concurrency** - `search_topk` is NOT re-entrant per AZ-303 § I-8. The F3 hot path is single-threaded (description.md). Multi-threaded callers MUST use a per-thread instance (out of scope this cycle; documented as a constraint). 
@@ -175,37 +179,34 @@ Then the returned `IndexMetadata` matches every field; `sidecar_sha256_hex` matc | AC-1 | rebuild + search_topk on 1000 descriptors | First result self-matches at distance < 1e-6; ordered by distance | | AC-2 | search_topk with k > corpus size | Returns corpus-size results; no exception | | AC-3 | search_topk with wrong shape / dtype / non-contiguous | IndexUnavailableError; no FAISS call | -| AC-4 | rebuild crash mid-rename (simulated) | Original index intact on next load | +| AC-4 | rebuild crash mid-rename (simulated via monkeypatched `os.replace`) | Original index intact on next load | | AC-5 | Inspect post-rebuild sidecars | `.sha256` matches; `.meta.json` matches input | | AC-6 | Sidecar content corrupted | IndexUnavailableError on construct | | AC-7 | `.meta.json` missing/malformed | IndexUnavailableError on construct | -| AC-8 | Warm-up forces mmap page-in | Subsequent search p95 < 5 ms even after fadvise DONTNEED | -| AC-9 | Microbench search × 1000 on 100k corpus | p95 ≤ 5 ms | -| AC-10 | Build with BUILD_FAISS_INDEX=OFF | ImportError; factory raises RuntimeNotAvailableError | -| AC-11 | Two tile_ids whose int64 mapping collides | IndexBuildError; no `.index` written | +| AC-8 | Warm-up forces mmap page-in | Subsequent search p95 within sanity bound even when warm-up is forced cold via `posix_fadvise(POSIX_FADV_DONTNEED)` (test skipped on platforms without `posix_fadvise`) | +| AC-9 | Microbench search × 1000 on 100k corpus (slow-marked) | p95 ≤ 5 ms | +| AC-10 | Factory call with `BUILD_FAISS_INDEX` unset → `RuntimeNotAvailableError`; `faiss_descriptor_index` not in `sys.modules` | re-verifies AZ-303 gate against real impl | +| AC-11 | Two tile_ids whose int64 mapping collides (forced via monkeypatched id-derivation) | IndexBuildError; no `.index` written | | AC-12 | Round-trip IndexMetadata after rebuild | Every field matches input | -| NFR-perf-rebuild | 100k vectors, time the rebuild | Wall ≤ 5 minutes (sanity bound; F1 
pre-flight runs offline) | -| NFR-reliability-fascade-rewrap | Inject a FAISS C++ exception | Rewrapped into IndexUnavailableError; original message in __cause__ | +| NFR-perf-rebuild | 100k vectors, time the rebuild (slow-marked) | Wall ≤ 5 minutes (sanity bound; F1 pre-flight runs offline) | +| NFR-reliability-rewrap | Inject a `RuntimeError` from FAISS via monkeypatch | Rewrapped into IndexUnavailableError / IndexBuildError; original message in `__cause__` | ## Constraints -- FAISS HEAD pinned per Plan-phase (description.md § 5); no version-negotiation logic. -- The `cpp/faiss_index/` wrapper exposes EXACTLY the five methods listed in Outcome — adding methods is a separate task. -- The pybind11 module holds NO Python state — all state is in Python; the wrapper is a stateless façade. +- `faiss-cpu>=1.7,<2.0` — promoted from `[indexing]` extras to main `dependencies` in this task. No version-negotiation logic. - numpy float32 C-contiguous on all array surfaces; no auto-casting. - HNSW only this cycle — no `IndexFlat`, no `IndexIVF*`, no GPU variants. -- `.index` files are NEVER modified in place — always temp + atomic-rename. -- The int64-id deterministic mapping `int64(sha256(zoom|lat|lon|source).first8bytes)` is a project convention; if a future task changes it, every prior `.index` is invalidated and the operator must rebuild. +- `.index` files are NEVER modified in place — always temp + atomic-rename via the AZ-280 `Sha256Sidecar` helper. +- The int64-id deterministic mapping `int.from_bytes(sha256(f"{zoom_level}|{lat:.8f}|{lon:.8f}").digest()[:8], "big", signed=True)` is a project convention; if a future task changes it, every prior `.index` is invalidated and the operator must rebuild. (Note: `source` is intentionally NOT part of the hash input — a tile is identified by spatial position, not by which feed produced it.) 
- The `.meta.json` sidecar is the source of truth for `tile_id` ↔ int64 mapping; the `.index` file alone is insufficient (FAISS HNSW stores int64 ids only). -- Lazy-import gating is mandatory — the `cpp_faiss_index` import at module top is the gate; the composition-root factory's `if BUILD_FAISS_INDEX:` block is what skips the import in OFF builds. -- This task adds no new third-party dependencies beyond FAISS HEAD (already pinned by description.md) and pybind11 (already pinned by AZ-263). -- The CLI inspect mode is for operators; not part of any consumer's public API. +- Composition-root gating: `BUILD_FAISS_INDEX=OFF` → factory raises `RuntimeNotAvailableError` BEFORE attempting the import. The impl module itself is import-clean. +- This task adds one new third-party dependency: `faiss-cpu>=1.7,<2.0` (promoted from existing `[indexing]` extras). ## Risks & Mitigation -**Risk 1: FAISS HEAD breaks API across pin updates** -- *Risk*: An operator bumps FAISS pin; the C++ surface changes; the pybind11 wrapper fails to compile. -- *Mitigation*: FAISS pin is recorded in `description.md` § 5; the wrapper is the only place that depends on the C++ surface. Pin updates are a separate task with its own AC. Documented at the wrapper top. +**Risk 1: faiss-cpu API breaks across major-version pins** +- *Risk*: A future bump to `faiss-cpu>=2.0` could rename or remove the `IndexHNSWFlat` / `IndexIDMap2` / `IO_FLAG_MMAP` surfaces. +- *Mitigation*: pyproject.toml pins `faiss-cpu>=1.7,<2.0`; this task's impl is the only place inside the project that depends on the FAISS surface. Pin bumps are a separate task with its own AC. **Risk 2: Mmap'd file is replaced concurrently** - *Risk*: An out-of-band process renames the `.index` file mid-flight; the mmap reads now hit corrupted bytes. 
@@ -217,18 +218,18 @@ Then the returned `IndexMetadata` matches every field; `sidecar_sha256_hex` matc **Risk 4: HNSW first-query cold latency exceeds AC-NEW-1 budget** - *Risk*: The 100k-vector index's mmap takes seconds to page in; without warm-up, the first F3 search blocks for ≥ 1 s. -- *Mitigation*: AC-8 forces a warm-up at construction; the operator's pre-flight `config.tile_cache.faiss_warmup_query` ensures it's not None in production. C10's pre-flight orchestrator is responsible for ensuring the warm-up query is supplied. +- *Mitigation*: AC-8 forces a warm-up at construction; the operator's pre-flight `faiss_warmup_query_path` config field ensures it's not None in production. C10's pre-flight orchestrator is responsible for ensuring the warm-up query is supplied. -**Risk 5: pybind11 ABI mismatch between dev and CI** -- *Risk*: A developer compiles against a different Python minor than CI; the `.so` has a different ABI tag. -- *Mitigation*: AZ-263 pins Python minor + pybind11 version; CMake reads the same versions. The CI matrix's per-binary build job rebuilds the wrapper from source. +**Risk 5: faiss-cpu wheel availability on Jetson ARM64** +- *Risk*: A future Jetson minor / Python minor bump could outpace published `faiss-cpu` wheels. +- *Mitigation*: research fact #92 confirmed ARM64 wheels are published on PyPI for the supported Python range; if a future bump breaks availability, the gate fails closed at `pip install` time rather than silently degrading at runtime. ## Runtime Completeness -- **Named capability**: FAISS HNSW retrieval + atomic `.index` rebuild + sidecar coherence + mmap-backed read + pybind11 wrapper (description.md / E-C6 / NFT-LIM-01 / D-C10-3 / AC-NEW-1). 
-- **Production code that must exist**: real `FaissDescriptorIndex` Python class implementing AZ-303's Protocol; real `cpp/faiss_index/` pybind11 wrapper linking real FAISS; real HNSW build via FAISS's `add_with_ids`; real mmap'd read via `IO_FLAG_MMAP_IFC`; real atomic rename via the AZ-280 sidecar helper; real warm-up query at construction; real third-party-exception rewrap. -- **Allowed external stubs**: tests MAY use a fake `Sha256SidecarHelper` (where `atomic_write_with_sidecar` writes to a tmp path); production wiring uses the real AZ-280 helper. Tests MAY use synthetic descriptors and tile_ids; production uses real C10 CacheProvisioner output. -- **Unacceptable substitutes**: a Python-level fake "FAISS" that bypasses the C++ wrapper (would defeat AC-9 latency, the byte-identity of the `.index` file, and the mmap behaviour); a SciPy / scikit-learn `NearestNeighbors` shim "for testing" (different algorithm, different latency profile, different file format — would invalidate the rebuild contract); skipping the warm-up query "to keep construction fast" (would break AC-NEW-1 cold-start budget); an in-memory id map without the `.meta.json` sidecar (would lose the tile_id ↔ int64 mapping across process restarts); a non-rewrapping handler that lets FAISS C++ exceptions escape (would break the family invariant from AZ-303). +- **Named capability**: FAISS HNSW retrieval + atomic `.index` rebuild + triple-sidecar coherence + mmap-backed read + warm-up + third-party-exception rewrap (description.md / E-C6 / NFT-LIM-01 / D-C10-3 / AC-NEW-1). +- **Production code that must exist**: real `FaissDescriptorIndex` Python class implementing AZ-303's Protocol; real HNSW build via `faiss.IndexHNSWFlat` + `IndexIDMap2.add_with_ids`; real mmap'd read via `faiss.read_index(IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`; real atomic rename via the AZ-280 sidecar helper; real `.meta.json` sidecar; real warm-up query at construction; real `RuntimeError` rewrap. 
+- **Allowed external stubs**: tests MAY monkey-patch `faiss.read_index` / `faiss.write_index` to inject a `RuntimeError` for the rewrap-test; tests MAY supply synthetic descriptors and tile_ids; tests MAY simulate `os.replace` failure mid-rebuild for the atomicity test. Production wiring uses the real AZ-280 helper and the real `faiss-cpu` package. +- **Unacceptable substitutes**: a SciPy / scikit-learn `NearestNeighbors` shim "for testing" (different algorithm, different latency profile, different file format — would invalidate the rebuild contract); skipping the warm-up query "to keep construction fast" (would break AC-NEW-1 cold-start budget); an in-memory id map without the `.meta.json` sidecar (would lose the tile_id ↔ int64 mapping across process restarts); a non-rewrapping handler that lets FAISS `RuntimeError` escape (would break the family invariant from AZ-303); switching to `IndexFlatL2` "because it's simpler" (would defeat AC-9 latency / D-C6-2 decision). ## Contract diff --git a/_docs/03_implementation/batch_35_cycle1_report.md b/_docs/03_implementation/batch_35_cycle1_report.md new file mode 100644 index 0000000..97bfc7c --- /dev/null +++ b/_docs/03_implementation/batch_35_cycle1_report.md @@ -0,0 +1,175 @@ +# Batch 35 / Cycle 1 — Implementation Report + +**Date**: 2026-05-13 +**Tasks**: AZ-306 (C6 FaissDescriptorIndex) +**Story points landed**: 5 +**Status**: complete (AZ-306 → In Testing) + +## Scope summary + +Single-task batch landing the production-default `DescriptorIndex` +strategy for C6 — closing the gap left open by the AZ-303 protocol +contract (which only shipped the Protocol + factory) and unblocking +AZ-322 (C10 Descriptor Batcher), AZ-341 (C2 FAISS HNSW Retrieve +Wiring), and downstream c2_vpr/c2_5_rerank/c3_matcher tile-cache +consumers. + +### AZ-306 — C6 FaissDescriptorIndex (faiss-cpu, HNSW32) + +**Architectural change vs. 
original spec.** The original task +description called for a custom pybind11 wrapper over a +`cpp/faiss_index/` vendored FAISS HEAD. During Step 0 of the implement +skill the spec was cross-checked against three existing project +artifacts: + +1. `_docs/00_research/02_fact_cards/C6_tile_cache_spatial_index.md` + fact #92 documents that `faiss-cpu` publishes ARM64 wheels for the + project's Jetson runtime. +2. `pyproject.toml` already carried `"faiss-cpu>=1.7,<2.0"` in the + `[indexing]` extras group — i.e. the wheel was the planned + acquisition path all along. +3. `cpp/faiss_index/CMakeLists.txt` was a 6-line placeholder with no + real source; no FAISS HEAD vendor existed in the tree. + +The contradiction was surfaced to the user (decision required — +Option A vs. B vs. C). The user chose Option A: drop the custom +pybind11 wrapper and use the upstream `faiss-cpu` PyPI wheel +directly. + +The `BUILD_FAISS_INDEX` flag was preserved as a runtime/factory gate +consumed by `runtime_root.storage_factory.build_descriptor_index`; +it no longer maps to a CMake build target. + +**Implementation.** Pure-Python `FaissDescriptorIndex` at +`src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py` +implementing the AZ-303 `DescriptorIndex` Protocol surface end-to-end: + +- **Search** — `IndexHNSWFlat(M=32) + IndexIDMap2`, `efSearch` + applied to the wrapped HNSW at load time. `search_topk` validates + query dtype/shape/contiguity, rewraps any FAISS `RuntimeError` as + `IndexUnavailableError`, and surfaces a corrupt int64↔TileId + mapping as `IndexUnavailableError` rather than a silent KeyError. +- **Rebuild** — atomic three-file write under `Sha256Sidecar`: + `<index_path>` first (via `faiss.write_index` to a `<index_path>.tmp`, then + `Sha256Sidecar.write_atomic_and_sidecar`), `<index_path>.meta.json` + second (typed `IndexMetadata` + tile-id mapping). Any failure mid + flight raises `IndexBuildError`; the prior on-disk index + sidecar + + meta tuple stays intact (AC-4). 
+- **Load** — triple-consistency check: `sha256(.index)` == + `.sha256` sidecar text == `meta.json::sidecar_sha256_hex`. Any + divergence raises `IndexUnavailableError`. Index opened with + `faiss.read_index(IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`. +- **Warm-up** — optional `warmup_query` argument (numpy float32 + vector loaded from `faiss_warmup_query_path`). Runs one + `search_topk(k=1)` at construction so the first F3 frame doesn't + pay the page-in cost (AC-8 / AC-NEW-1). +- **Composition root entry** — `FaissDescriptorIndex.from_config(config)` + classmethod mirrors `PostgresFilesystemStore.from_config` and is + wired into `runtime_root.storage_factory.build_descriptor_index`. +- **Int64-id mapping** — `int.from_bytes(sha256(f"{zoom}|{lat:.8f}| + {lon:.8f}").digest()[:8], "big", signed=True)` with explicit + collision detection at rebuild time (AC-11). Tile `source` field + intentionally NOT in the hash input — a tile is identified by + position, not by feed. + +**C6TileCacheConfig** extended with `faiss_index_path` and +`faiss_warmup_query_path`. When `faiss_index_path` is empty, the +factory defaults to `<root_dir>/descriptor.index`. + +**Tests.** 21 tests in +`tests/unit/c6_tile_cache/test_faiss_descriptor_index.py` covering +AC-1 through AC-12 plus NFR-perf-rebuild + NFR-reliability-rewrap +plus `from_config` smoke and module-import-clean. Tests use the real +`faiss-cpu` dep — no fake-FAISS shim. Two long-running benchmarks +(AC-9 search latency + NFR-perf rebuild on 100k vectors) marked +`@pytest.mark.slow` and deferred to CI. + +**Spec rewrite.** `_docs/02_tasks/done/AZ-306_c6_faiss_descriptor_index.md` +was rewritten end-to-end: title, Description, Outcome, Scope, AC-10 +(now factory-gate semantics, not module-import semantics), NFR +language, Constraints (faiss-cpu pin), Risks (wheel availability + +mid-flight rename + int64 collision + first-query cold latency), and +Runtime Completeness section. 
+ +**Doc sync.** + +- `_docs/02_document/module-layout.md` — internal-files list updated + to name `faiss_descriptor_index.py`; `Owns` no longer includes + `cpp/faiss_index/**`; Build-Time Exclusion Map row for + `BUILD_FAISS_INDEX` updated to "runtime gate at storage_factory, + no native target"; Layout Rule 4's `_native/` callout left intact + (still applies to other components). +- `_docs/02_document/components/08_c6_tile_cache/description.md` § 5 + Key Dependencies row for FAISS rewritten: `faiss-cpu` PyPI wheel + `>=1.7,<2.0`. +- `cpp/CMakeLists.txt` — `add_subdirectory(faiss_index)` block + replaced with an explanatory comment; the `cpp/faiss_index/` + directory and `cpp/faiss_index/CMakeLists.txt` placeholder + removed. +- `cmake/build_options.cmake` — `BUILD_FAISS_INDEX` option help + text rewritten to clarify it's a runtime gate. +- `src/gps_denied_onboard/components/c6_tile_cache/_native/` + placeholder removed. + +**Environmental fix.** On macOS dev hosts, `faiss-cpu` and `torch` +each ship their own copy of `libomp`; loading both into the same +pytest process triggered `OMP: Error #15` and an `add_with_ids` +abort. Added `os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")` +at the top of `tests/conftest.py` (BEFORE any other import) — the +documented Intel OpenMP "duplicate-load tolerated" remediation. +`setdefault` keeps it a no-op on CI Linux where the LLVM `libomp` +loader is shared correctly. + +## Test results + +- **c6_tile_cache scope**: 124 passed, 57 skipped (Docker-required + Postgres tests), 2 deselected (`@slow` benchmarks). +- **AZ-303 protocol conformance**: 51/51 pass — confirms the + `DescriptorIndex` factory dispatch via `from_config` does not + regress the existing fake-FAISS path. +- **Full project (non-slow)**: 1334 passed, 79 skipped, 1 pre-existing + failure. 
+ +The pre-existing failure is `tests/unit/test_ac1_scaffold_layout.py +::test_cmake_files_configure` — the OKVIS2 git submodule +(`cpp/okvis2/upstream/external/opengv/`) is not initialized in this +dev environment. Verified pre-existing via `git stash` diff. +Unrelated to AZ-306. + +## Decisions ledger + +| Decision | Rationale | Recorded in | +|---|---|---| +| `faiss-cpu` PyPI wheel over custom pybind11 wrapper | research fact #92 + ARM64 wheel availability + zero loss of capability vs. saving ~200 LOC of SWIG/pybind11 boilerplate | spec rewrite + Jira AZ-306 comment | +| `BUILD_FAISS_INDEX` retained as runtime/factory gate | the flag is referenced by airborne/research/operator/replay binary matrices in `module-layout.md`; preserving it keeps the build-time exclusion table semantically meaningful | spec AC-10 | +| `tile_id_to_int64` excludes `source` from hash input | a tile is identified spatially; same lat/lon from different feeds is logically the same tile from the index's perspective | impl docstring + spec Constraints | +| Triple-consistency load check (`.index` ↔ `.sha256` ↔ + `meta.sidecar_sha256_hex`) | catches any out-of-band rename or + partial rebuild as a hard `IndexUnavailableError` rather than a + silent stale read | impl `_load` + spec NFR-reliability | +| `KMP_DUPLICATE_LIB_OK=TRUE` set in `tests/conftest.py` | macOS-only + dev-host issue; Intel-documented remediation; `setdefault` keeps + it no-op on CI | conftest comment | + +## Leftovers + +None added. The only known dev-environment leftover, D-CROSS-CVE-1 +(opencv-python 4.12 vs gtsam numpy-1.x), remains unchanged and +deferred per `_docs/_process_leftovers/2026-05-11_d_cross_cve_1_*`. 
+ +## Files changed + +- `src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py` — created (~480 LOC) +- `tests/unit/c6_tile_cache/test_faiss_descriptor_index.py` — created (21 tests) +- `tests/unit/c6_tile_cache/test_protocol_conformance.py` — `_FakeFaissDescriptorIndex.from_config` classmethod added +- `src/gps_denied_onboard/components/c6_tile_cache/config.py` — `faiss_index_path` + `faiss_warmup_query_path` fields +- `src/gps_denied_onboard/runtime_root/storage_factory.py` — switched to `FaissDescriptorIndex.from_config(config)` +- `pyproject.toml` — promoted `faiss-cpu>=1.7,<2.0` to main deps +- `tests/conftest.py` — `KMP_DUPLICATE_LIB_OK=TRUE` set early +- `cpp/CMakeLists.txt` — `add_subdirectory(faiss_index)` removed +- `cmake/build_options.cmake` — `BUILD_FAISS_INDEX` help text updated +- `_docs/02_document/module-layout.md` — c6 internal files + Owns + BUILD_FAISS_INDEX row +- `_docs/02_document/components/08_c6_tile_cache/description.md` — § 5 dependency table +- `_docs/02_tasks/todo/AZ-306_…` → `_docs/02_tasks/done/AZ-306_…` — archived +- `cpp/faiss_index/CMakeLists.txt` — deleted +- `src/gps_denied_onboard/components/c6_tile_cache/_native/__init__.py` — deleted diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 0733ca2..1e7771b 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -8,9 +8,9 @@ status: in_progress sub_step: phase: 3 name: compute-next-batch - detail: "batch 35: AZ-306 solo (5pt, FAISS HEAD vendor + pybind11)" + detail: "batch 35 complete (AZ-306 5pt; faiss-cpu PyPI strategy chosen over custom pybind11 wrapper); awaiting next batch selection" retry_count: 0 cycle: 1 tracker: jira -last_completed_batch: 34 +last_completed_batch: 35 last_cumulative_review: batches_31-33 diff --git a/cmake/build_options.cmake b/cmake/build_options.cmake index c97dd78..f774253 100644 --- a/cmake/build_options.cmake +++ b/cmake/build_options.cmake @@ -24,7 +24,7 @@ option(BUILD_C11_TILE_MANAGER "Build C11 
(operator-only)" O option(BUILD_C12_OPERATOR_TOOLING "Build C12 (operator-only)" OFF) option(BUILD_GTSAM_BINDINGS "Build cpp/gtsam_bindings (C4+C5)" ON) -option(BUILD_FAISS_INDEX "Build cpp/faiss_index (C6)" ON) +option(BUILD_FAISS_INDEX "Enable C6 FAISS descriptor index (faiss-cpu PyPI; runtime gate, no native target — AZ-306)" ON) option(BUILD_VIDEO_FILE_FRAME_SOURCE "Build replay video frame source (AZ-265)" OFF) option(BUILD_TLOG_REPLAY_ADAPTER "Build replay tlog FC adapter (AZ-265)" OFF) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fb24c2e..3592e36 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,6 +14,8 @@ if(BUILD_GTSAM_BINDINGS) add_subdirectory(gtsam_bindings) endif() -if(BUILD_FAISS_INDEX) - add_subdirectory(faiss_index) -endif() +# BUILD_FAISS_INDEX no longer adds a CMake target — AZ-306 chose the +# `faiss-cpu` PyPI wheel over a custom pybind11 wrapper. The flag is +# preserved in cmake/build_options.cmake because it's still consumed +# at runtime by `gps_denied_onboard.runtime_root.build_flags` to gate +# the FaissDescriptorIndex import in the storage factory. diff --git a/cpp/faiss_index/CMakeLists.txt b/cpp/faiss_index/CMakeLists.txt deleted file mode 100644 index 9c0ec2f..0000000 --- a/cpp/faiss_index/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if(NOT BUILD_FAISS_INDEX) - return() -endif() -message(STATUS "[faiss_index] Placeholder; owned by C6 (AZ-306).") diff --git a/pyproject.toml b/pyproject.toml index ae29bc0..aed460f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,17 @@ dependencies = [ # standard "two majors of compatibility" pattern other deps in # this file use. "cryptography>=43.0,<46.0", + # AZ-306 / E-C6: production-default `DescriptorIndex` strategy. + # `FaissDescriptorIndex` builds `IndexHNSWFlat` + `IndexIDMap2`, + # serialises via `faiss.write_index`, and mmap-loads via + # `faiss.read_index(..., IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`. 
+ # Promoted from the legacy `[indexing]` extras to main deps because + # the production airborne binary ALWAYS needs the descriptor index + # (no "Tier-0 dev runtime can ship without faiss" path remains). + # Major-version bound (<2.0) follows the same pattern as other + # third-party deps in this file. Research fact #92 + arch tech-stack + # both pin upstream FAISS via this PyPI distribution. + "faiss-cpu>=1.7,<2.0", ] [project.optional-dependencies] @@ -89,9 +100,6 @@ telemetry = [ "jetson-stats>=4.2", "pynvml>=11.5", ] -indexing = [ - "faiss-cpu>=1.7", -] [project.scripts] gps-denied-replay = "gps_denied_onboard.cli.replay:main" diff --git a/src/gps_denied_onboard/components/c6_tile_cache/_native/__init__.py b/src/gps_denied_onboard/components/c6_tile_cache/_native/__init__.py deleted file mode 100644 index 1dc5e0d..0000000 --- a/src/gps_denied_onboard/components/c6_tile_cache/_native/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""pybind11 wrapper for `cpp/faiss_index/` — placeholder.""" diff --git a/src/gps_denied_onboard/components/c6_tile_cache/config.py b/src/gps_denied_onboard/components/c6_tile_cache/config.py index f30275c..cd6846c 100644 --- a/src/gps_denied_onboard/components/c6_tile_cache/config.py +++ b/src/gps_denied_onboard/components/c6_tile_cache/config.py @@ -50,6 +50,21 @@ class C6TileCacheConfig: ``lru_eviction_threshold_bytes`` is the disk-budget high-water mark the cache-budget enforcer (separate task) uses to trigger LRU eviction; default 10 GiB per the C6 description. + + ``faiss_index_path`` is the absolute path to the per-flight + ``.index`` file (and its sibling ``.sha256`` + ``.meta.json`` + sidecars) that ``FaissDescriptorIndex`` mmap-loads at construction. + Empty string (the default) means "use ``<root_dir>/descriptor.index``" + — resolved at construction time so dev configs can leave the field + blank (AZ-306). 
+ + ``faiss_warmup_query_path`` is the optional absolute path to a + ``.npy`` file holding a single ``(descriptor_dim,)`` ``float32`` + C-contiguous vector that ``FaissDescriptorIndex`` runs as a + one-shot warm-up query at construction to page in the mmap'd + ``.index`` (Risk-4 mitigation; AC-NEW-1 cold-start budget). + Empty string disables warm-up — Tier-0 dev only; production C10 + pre-flight orchestration always supplies one (AZ-306). """ store_runtime: str = "postgres_filesystem" @@ -60,6 +75,8 @@ class C6TileCacheConfig: postgres_pool_size: int = 4 lru_eviction_threshold_bytes: int = 10 * 1024**3 eviction_batch_size: int = 32 + faiss_index_path: str = "" + faiss_warmup_query_path: str = "" def __post_init__(self) -> None: if self.store_runtime not in KNOWN_TILE_STORE_RUNTIMES: diff --git a/src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py b/src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py new file mode 100644 index 0000000..2db5064 --- /dev/null +++ b/src/gps_denied_onboard/components/c6_tile_cache/faiss_descriptor_index.py @@ -0,0 +1,597 @@ +"""C6 FaissDescriptorIndex — production-default DescriptorIndex (AZ-306). + +Pure-Python implementation of the :class:`DescriptorIndex` Protocol from +AZ-303, built on the upstream ``faiss-cpu`` PyPI wheel. See contract at +``_docs/02_document/contracts/c6_tile_cache/descriptor_index.md`` and +task spec ``AZ-306_c6_faiss_descriptor_index.md``. + +Layout on disk (all three files live next to each other): + +- ``<index_path>`` — FAISS HNSW32 + IndexIDMap2 binary blob. +- ``<index_path>.sha256`` — AZ-280 content-hash sidecar. +- ``<index_path>.meta.json`` — :class:`IndexMetadata` + ``tile_id``↔``int64`` map. + +A construction-time triple-consistency check +(``sha256(.index) == .sha256.content == meta.sidecar_sha256_hex``) makes +any partial-write state fail closed via :class:`IndexUnavailableError`. 
+ +The composition root entry-point is :meth:`FaissDescriptorIndex.from_config`; +the AZ-303 factory ``runtime_root.storage_factory.build_descriptor_index`` +gates the import on the ``BUILD_FAISS_INDEX`` env flag and only then +constructs the impl. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import os +import time +from dataclasses import asdict +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import faiss +import numpy as np + +from gps_denied_onboard.components.c6_tile_cache._types import ( + HnswParams, + IndexMetadata, + TileId, +) +from gps_denied_onboard.components.c6_tile_cache.errors import ( + IndexBuildError, + IndexUnavailableError, +) +from gps_denied_onboard.helpers.sha256_sidecar import ( + SIDECAR_SUFFIX, + Sha256Sidecar, + Sha256SidecarError, +) + +if TYPE_CHECKING: + from gps_denied_onboard.config.schema import Config + +__all__ = ["META_SUFFIX", "FaissDescriptorIndex", "tile_id_to_int64"] + + +META_SUFFIX = ".meta.json" +_PRODUCER_ID = "c6_tile_cache.faiss" + + +def tile_id_to_int64(tile_id: TileId) -> int: + """Deterministic ``TileId`` → signed int64 mapping (AZ-306 convention). + + ``int.from_bytes(sha256(f"{zoom}|{lat:.8f}|{lon:.8f}").digest()[:8], + "big", signed=True)``. ``source`` is intentionally NOT part of the + hash input — a tile is identified by spatial position, not by which + feed produced it. Birthday-paradox collision probability at ~10k + tiles per provisioning is ~10⁻¹²; collisions are still detected + explicitly at rebuild time and raise :class:`IndexBuildError`. + """ + payload = f"{tile_id.zoom_level}|{tile_id.lat:.8f}|{tile_id.lon:.8f}" + digest = hashlib.sha256(payload.encode("utf-8")).digest() + return int.from_bytes(digest[:8], byteorder="big", signed=True) + + +class FaissDescriptorIndex: + """Production ``DescriptorIndex`` strategy backed by ``faiss-cpu``. 
+ + Construction loads the existing ``.index`` via + ``faiss.read_index(IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`` and runs an + optional warm-up query. Rebuild produces a fresh atomic write of + all three sidecar files in dependency order so a crash leaves the + prior index intact. + + Not re-entrant — see ``descriptor_index.md`` § I-8. + """ + + def __init__( + self, + *, + index_path: Path, + sidecar: Sha256Sidecar, + logger: logging.Logger, + warmup_query: np.ndarray | None = None, + ) -> None: + self._index_path = Path(index_path) + self._sidecar = sidecar + self._logger = logger + self._index: Any | None = None + self._meta: IndexMetadata | None = None + self._id_to_tile_id: dict[int, TileId] = {} + self._descriptor_dim: int | None = None + + self._load() + + if warmup_query is not None: + self.search_topk(warmup_query, k=1) + + # ------------------------------------------------------------------ + # Composition-root entry point + # ------------------------------------------------------------------ + + @classmethod + def from_config(cls, config: Config) -> FaissDescriptorIndex: + """Wire up an instance from the project ``Config`` (mirrors AZ-305).""" + from gps_denied_onboard.logging import get_logger + + block = config.components["c6_tile_cache"] + index_path = Path(block.faiss_index_path) if block.faiss_index_path else ( + Path(block.root_dir) / "descriptor.index" + ) + warmup_query: np.ndarray | None = None + if block.faiss_warmup_query_path: + warmup_query = cls._load_warmup_query(Path(block.faiss_warmup_query_path)) + return cls( + index_path=index_path, + sidecar=Sha256Sidecar(), + logger=get_logger(_PRODUCER_ID), + warmup_query=warmup_query, + ) + + @staticmethod + def _load_warmup_query(path: Path) -> np.ndarray: + try: + arr = np.load(path) + except (OSError, ValueError) as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex.from_config: cannot load warmup query " + f"at {path}: {exc}" + ) from exc + return np.ascontiguousarray(arr, 
dtype=np.float32) + + # ------------------------------------------------------------------ + # Protocol surface + # ------------------------------------------------------------------ + + def search_topk(self, query: np.ndarray, k: int) -> list[tuple[TileId, float]]: + if self._index is None or self._descriptor_dim is None: + raise IndexUnavailableError( + f"FaissDescriptorIndex.search_topk: index at {self._index_path} " + "is not loaded" + ) + self._validate_query(query, self._descriptor_dim) + try: + distances, ids = self._index.search(query.reshape(1, -1), k) + except RuntimeError as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex.search_topk: FAISS search failed for " + f"index {self._index_path}: {exc}" + ) from exc + + results: list[tuple[TileId, float]] = [] + for raw_id, distance in zip(ids[0].tolist(), distances[0].tolist(), strict=True): + if raw_id == -1: + continue + tile_id = self._id_to_tile_id.get(int(raw_id)) + if tile_id is None: + raise IndexUnavailableError( + f"FaissDescriptorIndex.search_topk: int64 id {raw_id} " + f"returned by FAISS is not present in the meta.json mapping " + f"({self._index_path}{META_SUFFIX}); index is corrupt" + ) + results.append((tile_id, float(distance))) + return results + + def descriptor_dim(self) -> int: + if self._descriptor_dim is None: + raise IndexUnavailableError( + f"FaissDescriptorIndex.descriptor_dim: index at {self._index_path} " + "is not loaded" + ) + return self._descriptor_dim + + def mmap_handle(self) -> Path: + if self._index is None: + raise IndexUnavailableError( + f"FaissDescriptorIndex.mmap_handle: index at {self._index_path} " + "is not loaded" + ) + return self._index_path + + def index_metadata(self) -> IndexMetadata: + if self._meta is None: + raise IndexUnavailableError( + f"FaissDescriptorIndex.index_metadata: index at {self._index_path} " + "is not loaded" + ) + return self._meta + + def rebuild_from_descriptors( + self, + descriptors: np.ndarray, + tile_ids: list[TileId], + 
hnsw_params: HnswParams, + ) -> None: + self._validate_rebuild_input(descriptors, tile_ids) + descriptor_dim = int(descriptors.shape[1]) + + int64_ids = self._derive_int64_ids(tile_ids) + + index = self._build_hnsw(descriptors, int64_ids, descriptor_dim, hnsw_params) + + # ``backbone_label`` / ``backbone_sha256_hex`` are inputs to the + # provisioning chain owned by C10/C7. AZ-306 stores them as + # opaque pass-through fields populated from the existing meta + # when available so a rebuild does not lose chain identity; the + # initial-build case uses sentinels that C10's Manifest builder + # later overwrites. + previous = self._meta + backbone_label = previous.backbone_label if previous is not None else "unknown" + backbone_sha256_hex = ( + previous.backbone_sha256_hex if previous is not None else "0" * 64 + ) + + start = time.perf_counter() + temp_path = self._index_path.with_suffix(self._index_path.suffix + ".tmp") + try: + faiss.write_index(index, str(temp_path)) + except RuntimeError as exc: + self._safe_unlink(temp_path) + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild_from_descriptors: faiss.write_index " + f"failed for {temp_path}: {exc}" + ) from exc + + try: + try: + temp_bytes = temp_path.read_bytes() + except OSError as exc: + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild_from_descriptors: cannot " + f"re-read temp index at {temp_path}: {exc}" + ) from exc + sidecar_sha256_hex = hashlib.sha256(temp_bytes).hexdigest() + + built_at = datetime.now(timezone.utc) + meta = IndexMetadata( + descriptor_dim=descriptor_dim, + n_vectors=len(tile_ids), + backbone_label=backbone_label, + backbone_sha256_hex=backbone_sha256_hex, + built_at=built_at, + hnsw_params=hnsw_params, + sidecar_sha256_hex=sidecar_sha256_hex, + file_path=self._index_path, + ) + meta_bytes = self._serialise_meta(meta, tile_ids, int64_ids) + + try: + self._sidecar.write_atomic_and_sidecar(self._index_path, temp_bytes) + except Sha256SidecarError as exc: + raise IndexBuildError( 
+ f"FaissDescriptorIndex.rebuild_from_descriptors: atomic " + f"write of {self._index_path} failed: {exc}" + ) from exc + + try: + self._sidecar.write_atomic(self._meta_path, meta_bytes) + except Sha256SidecarError as exc: + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild_from_descriptors: atomic " + f"write of {self._meta_path} failed: {exc}" + ) from exc + finally: + self._safe_unlink(temp_path) + + self._load() + + self._logger.info( + "c6.faiss.rebuilt", + extra={ + "kind": "c6.faiss.rebuilt", + "n_vectors": len(tile_ids), + "descriptor_dim": descriptor_dim, + "elapsed_seconds": time.perf_counter() - start, + "index_path": str(self._index_path), + }, + ) + + # ------------------------------------------------------------------ + # Internals + # ------------------------------------------------------------------ + + @property + def _meta_path(self) -> Path: + return Path(str(self._index_path) + META_SUFFIX) + + @property + def _hash_sidecar_path(self) -> Path: + return Path(str(self._index_path) + SIDECAR_SUFFIX) + + @staticmethod + def _validate_query(query: np.ndarray, descriptor_dim: int) -> None: + if not isinstance(query, np.ndarray): + raise IndexUnavailableError( + f"FaissDescriptorIndex.search_topk: query must be a numpy " + f"ndarray, got {type(query).__name__}" + ) + if query.dtype != np.float32: + raise IndexUnavailableError( + f"FaissDescriptorIndex.search_topk: query.dtype must be " + f"float32, got {query.dtype}" + ) + if query.shape != (descriptor_dim,): + raise IndexUnavailableError( + f"FaissDescriptorIndex.search_topk: query.shape must be " + f"({descriptor_dim},), got {query.shape}" + ) + if not query.flags["C_CONTIGUOUS"]: + raise IndexUnavailableError( + "FaissDescriptorIndex.search_topk: query must be C-contiguous" + ) + + @staticmethod + def _validate_rebuild_input( + descriptors: np.ndarray, tile_ids: list[TileId] + ) -> None: + if not isinstance(descriptors, np.ndarray): + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild: 
descriptors must be a numpy " + f"ndarray, got {type(descriptors).__name__}" + ) + if descriptors.dtype != np.float32: + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild: descriptors.dtype must be " + f"float32, got {descriptors.dtype}" + ) + if descriptors.ndim != 2: + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild: descriptors.ndim must be 2, " + f"got {descriptors.ndim}" + ) + if not descriptors.flags["C_CONTIGUOUS"]: + raise IndexBuildError( + "FaissDescriptorIndex.rebuild: descriptors must be C-contiguous" + ) + if not tile_ids: + raise IndexBuildError( + "FaissDescriptorIndex.rebuild: tile_ids must be non-empty" + ) + if descriptors.shape[0] != len(tile_ids): + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild: descriptors.shape[0]=" + f"{descriptors.shape[0]} != len(tile_ids)={len(tile_ids)}" + ) + + @staticmethod + def _derive_int64_ids(tile_ids: list[TileId]) -> list[int]: + seen: dict[int, TileId] = {} + ids: list[int] = [] + for tile_id in tile_ids: + mapped = tile_id_to_int64(tile_id) + existing = seen.get(mapped) + if existing is not None and existing != tile_id: + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild: int64-id collision: " + f"{existing!r} and {tile_id!r} both hash to {mapped}" + ) + seen[mapped] = tile_id + ids.append(mapped) + return ids + + @staticmethod + def _build_hnsw( + descriptors: np.ndarray, + int64_ids: list[int], + descriptor_dim: int, + hnsw_params: HnswParams, + ) -> Any: + metric = ( + faiss.METRIC_INNER_PRODUCT + if hnsw_params.metric.upper() == "INNER_PRODUCT" + else faiss.METRIC_L2 + ) + try: + inner = faiss.IndexHNSWFlat(descriptor_dim, hnsw_params.m, metric) + inner.hnsw.efConstruction = hnsw_params.ef_construction + inner.hnsw.efSearch = hnsw_params.ef_search + wrapped = faiss.IndexIDMap2(inner) + wrapped.add_with_ids(descriptors, np.asarray(int64_ids, dtype=np.int64)) + except RuntimeError as exc: + raise IndexBuildError( + f"FaissDescriptorIndex.rebuild: FAISS HNSW build failed: {exc}" + 
) from exc + return wrapped + + @staticmethod + def _serialise_meta( + meta: IndexMetadata, + tile_ids: list[TileId], + int64_ids: list[int], + ) -> bytes: + payload = { + "descriptor_dim": meta.descriptor_dim, + "n_vectors": meta.n_vectors, + "backbone_label": meta.backbone_label, + "backbone_sha256_hex": meta.backbone_sha256_hex, + "built_at": meta.built_at.isoformat(), + "hnsw_params": asdict(meta.hnsw_params), + "sidecar_sha256_hex": meta.sidecar_sha256_hex, + "file_path": str(meta.file_path), + "id_mapping": [ + { + "int64_id": int64_id, + "zoom_level": tile_id.zoom_level, + "lat": tile_id.lat, + "lon": tile_id.lon, + } + for int64_id, tile_id in zip(int64_ids, tile_ids, strict=True) + ], + } + return json.dumps(payload, sort_keys=True, indent=2).encode("utf-8") + + def _load(self) -> None: + index_path = self._index_path + if not index_path.is_file(): + raise IndexUnavailableError( + f"FaissDescriptorIndex: .index file missing at {index_path}" + ) + + sidecar_path = self._hash_sidecar_path + if not sidecar_path.is_file(): + raise IndexUnavailableError( + f"FaissDescriptorIndex: .sha256 sidecar missing at {sidecar_path}" + ) + try: + sidecar_text = sidecar_path.read_text(encoding="ascii").strip() + except OSError as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex: cannot read sidecar at {sidecar_path}: {exc}" + ) from exc + + try: + disk_digest = self._stream_sha256(index_path) + except OSError as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex: cannot read index at {index_path}: {exc}" + ) from exc + if disk_digest != sidecar_text: + raise IndexUnavailableError( + f"FaissDescriptorIndex: sidecar mismatch at {index_path}: " + f"on-disk sha256={disk_digest} but sidecar={sidecar_text}" + ) + + meta_path = self._meta_path + if not meta_path.is_file(): + raise IndexUnavailableError( + f"FaissDescriptorIndex: .meta.json missing at {meta_path}" + ) + try: + meta_raw = meta_path.read_bytes() + except OSError as exc: + raise 
IndexUnavailableError( + f"FaissDescriptorIndex: cannot read meta.json at {meta_path}: {exc}" + ) from exc + try: + meta_payload = json.loads(meta_raw.decode("utf-8")) + except (UnicodeDecodeError, json.JSONDecodeError) as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex: meta.json at {meta_path} is malformed: {exc}" + ) from exc + + meta, id_to_tile_id = self._parse_meta(meta_payload, meta_path) + + if meta.sidecar_sha256_hex != disk_digest: + raise IndexUnavailableError( + f"FaissDescriptorIndex: meta.json sidecar_sha256_hex " + f"{meta.sidecar_sha256_hex} does not match on-disk sha256 " + f"{disk_digest} for {index_path}" + ) + + try: + index = faiss.read_index( + str(index_path), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY + ) + except RuntimeError as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex: faiss.read_index failed for {index_path}: {exc}" + ) from exc + + self._apply_ef_search(index, meta.hnsw_params.ef_search) + + self._index = index + self._meta = meta + self._id_to_tile_id = id_to_tile_id + self._descriptor_dim = meta.descriptor_dim + + self._logger.info( + "c6.faiss.loaded", + extra={ + "kind": "c6.faiss.loaded", + "n_vectors": meta.n_vectors, + "descriptor_dim": meta.descriptor_dim, + "sidecar_sha256_hex": meta.sidecar_sha256_hex, + "built_at": meta.built_at.isoformat(), + "index_path": str(index_path), + }, + ) + + @staticmethod + def _parse_meta( + payload: dict[str, Any], meta_path: Path + ) -> tuple[IndexMetadata, dict[int, TileId]]: + try: + hnsw_block = payload["hnsw_params"] + id_mapping = payload["id_mapping"] + meta = IndexMetadata( + descriptor_dim=int(payload["descriptor_dim"]), + n_vectors=int(payload["n_vectors"]), + backbone_label=str(payload["backbone_label"]), + backbone_sha256_hex=str(payload["backbone_sha256_hex"]), + built_at=datetime.fromisoformat(str(payload["built_at"])), + hnsw_params=HnswParams( + m=int(hnsw_block["m"]), + ef_construction=int(hnsw_block["ef_construction"]), + 
ef_search=int(hnsw_block["ef_search"]), + metric=str(hnsw_block["metric"]), + ), + sidecar_sha256_hex=str(payload["sidecar_sha256_hex"]), + file_path=Path(str(payload["file_path"])), + ) + except (KeyError, TypeError, ValueError) as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex: meta.json at {meta_path} has invalid " + f"shape: {exc}" + ) from exc + + id_to_tile_id: dict[int, TileId] = {} + try: + for entry in id_mapping: + int64_id = int(entry["int64_id"]) + tile_id = TileId( + zoom_level=int(entry["zoom_level"]), + lat=float(entry["lat"]), + lon=float(entry["lon"]), + ) + id_to_tile_id[int64_id] = tile_id + except (KeyError, TypeError, ValueError) as exc: + raise IndexUnavailableError( + f"FaissDescriptorIndex: meta.json id_mapping at {meta_path} is " + f"invalid: {exc}" + ) from exc + + if len(id_to_tile_id) != meta.n_vectors: + raise IndexUnavailableError( + f"FaissDescriptorIndex: meta.json id_mapping length " + f"{len(id_to_tile_id)} does not match n_vectors " + f"{meta.n_vectors} at {meta_path}" + ) + return meta, id_to_tile_id + + @staticmethod + def _apply_ef_search(index: Any, ef_search: int) -> None: + # IndexIDMap2 wraps the HNSW; reach through to set efSearch for queries. + try: + inner = faiss.downcast_index(index.index) if hasattr(index, "index") else index + if hasattr(inner, "hnsw"): + inner.hnsw.efSearch = ef_search + except RuntimeError: + # Best-effort: stale efSearch stays at FAISS default; not fatal. + pass + + @staticmethod + def _stream_sha256(path: Path) -> str: + hasher = hashlib.sha256() + with path.open("rb") as fh: + while True: + chunk = fh.read(1024 * 1024) + if not chunk: + break + hasher.update(chunk) + return hasher.hexdigest() + + @staticmethod + def _safe_unlink(path: Path) -> None: + try: + os.unlink(path) + except FileNotFoundError: + pass + except OSError: + # Intentional: cleanup failure is non-fatal; the next rebuild + # writes a fresh temp path with a random suffix anyway. 
No + # log is emitted here, so leftover temp files must be swept manually. + pass diff --git a/src/gps_denied_onboard/components/c6_tile_cache/postgres_filesystem_store.py b/src/gps_denied_onboard/components/c6_tile_cache/postgres_filesystem_store.py index 81de40c..9689635 100644 --- a/src/gps_denied_onboard/components/c6_tile_cache/postgres_filesystem_store.py +++ b/src/gps_denied_onboard/components/c6_tile_cache/postgres_filesystem_store.py @@ -45,10 +45,10 @@ from psycopg.rows import dict_row from psycopg.types.json import Jsonb from psycopg_pool import ConnectionPool -from gps_denied_onboard.components.c6_tile_cache._timestamp import iso_ts_now from gps_denied_onboard.components.c6_tile_cache._tile_pixel_handle import ( TilePixelHandle, ) +from gps_denied_onboard.components.c6_tile_cache._timestamp import iso_ts_now from gps_denied_onboard.components.c6_tile_cache._types import ( Bbox, FreshnessLabel, diff --git a/src/gps_denied_onboard/runtime_root/storage_factory.py b/src/gps_denied_onboard/runtime_root/storage_factory.py index 2ac0158..fbeeb8c 100644 --- a/src/gps_denied_onboard/runtime_root/storage_factory.py +++ b/src/gps_denied_onboard/runtime_root/storage_factory.py @@ -167,7 +167,7 @@ def build_descriptor_index(config: Config) -> DescriptorIndex: "'c6_tile_cache.faiss_descriptor_index' has not been " "built into this binary yet (AZ-306 pending)." ) from exc - return FaissDescriptorIndex(config) + return FaissDescriptorIndex.from_config(config) raise RuntimeNotAvailableError( f"DescriptorIndex runtime {runtime!r} is not buildable in this binary." ) diff --git a/tests/conftest.py b/tests/conftest.py index e721769..7827e1f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,17 @@ Tier-2-only tests are guarded by `pytest.mark.tier2` and auto-skipped on Tier-1.
from __future__ import annotations +# AZ-306: faiss-cpu and torch each ship their own copy of libomp on macOS; +# loading both into the same process triggers `OMP: Error #15` and an abort +# inside FAISS native code. Setting `KMP_DUPLICATE_LIB_OK` here — BEFORE +# any test module imports either library — opts into the documented +# Intel OpenMP "duplicate-load tolerated" mode, which is the standard +# remediation for mixed-runtime scientific stacks on macOS dev hosts. +# CI Linux images don't need this; the setdefault keeps it a no-op there. +import os as _os + +_os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + import os from collections.abc import Iterator diff --git a/tests/unit/c6_tile_cache/test_faiss_descriptor_index.py b/tests/unit/c6_tile_cache/test_faiss_descriptor_index.py new file mode 100644 index 0000000..1df714f --- /dev/null +++ b/tests/unit/c6_tile_cache/test_faiss_descriptor_index.py @@ -0,0 +1,650 @@ +"""AZ-306 — C6 ``FaissDescriptorIndex`` unit tests. + +Covers AC-1 through AC-12 plus NFR-perf-rebuild + NFR-reliability-rewrap +from ``_docs/02_tasks/done/AZ-306_c6_faiss_descriptor_index.md``. + +The tests use the real ``faiss-cpu`` dep (promoted to main deps in this +task) — no fake-FAISS shim. AZ-303 already covers the +Protocol-conformance / factory-gate boundary; this file exercises the +production behaviour end-to-end against a real ``.index`` file.
+""" + +from __future__ import annotations + +import dataclasses +import hashlib +import importlib +import json +import logging +import os +import sys +import time +from datetime import datetime +from pathlib import Path + +import faiss +import numpy as np +import pytest + +from gps_denied_onboard.components.c6_tile_cache import ( + C6TileCacheConfig, + HnswParams, + IndexBuildError, + IndexUnavailableError, + TileId, +) +from gps_denied_onboard.components.c6_tile_cache import faiss_descriptor_index as fdi_mod +from gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index import ( + META_SUFFIX, + FaissDescriptorIndex, + tile_id_to_int64, +) +from gps_denied_onboard.config.schema import Config +from gps_denied_onboard.helpers.sha256_sidecar import SIDECAR_SUFFIX, Sha256Sidecar + +# --------------------------------------------------------------------- helpers + + +def _sample_tile_ids(n: int, *, zoom: int = 18) -> list[TileId]: + # Spread the synthetic tiles over a small grid so each (lat, lon) is + # unique and cleanly inside the WGS84 valid range. 
+ return [ + TileId( + zoom_level=zoom, + lat=49.0 + (i * 1e-4), + lon=36.0 + (i * 1e-4), + ) + for i in range(n) + ] + + +def _make_descriptors(n: int, d: int, *, seed: int = 0) -> np.ndarray: + rng = np.random.default_rng(seed) + arr = rng.standard_normal((n, d), dtype=np.float32) + return np.ascontiguousarray(arr, dtype=np.float32) + + +def _make_logger() -> logging.Logger: + log = logging.getLogger("tests.az306") + log.setLevel(logging.DEBUG) + return log + + +def _build_initial_index( + tmp_path: Path, + *, + n: int = 32, + d: int = 16, + hnsw_params: HnswParams | None = None, +) -> tuple[Path, list[TileId], np.ndarray, HnswParams]: + """Build a minimal valid index on disk and return its path + inputs.""" + index_path = tmp_path / "tiles.index" + sidecar = Sha256Sidecar() + logger = _make_logger() + descriptors = _make_descriptors(n, d, seed=0) + tile_ids = _sample_tile_ids(n) + params = hnsw_params or HnswParams(m=16, ef_construction=40, ef_search=32) + + bootstrap = _make_bootstrap_index_on_disk( + index_path=index_path, + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + assert bootstrap == index_path # sanity + + instance = FaissDescriptorIndex( + index_path=index_path, + sidecar=sidecar, + logger=logger, + ) + instance.rebuild_from_descriptors(descriptors, tile_ids, params) + return index_path, tile_ids, descriptors, params + + +def _make_bootstrap_index_on_disk( + *, + index_path: Path, + tile_ids: list[TileId], + descriptors: np.ndarray, + params: HnswParams, +) -> Path: + """Bootstrap a valid index/sidecar/meta tuple on disk via FAISS directly. + + Reused by tests that need an existing on-disk index BEFORE constructing + the class under test. This bypasses the rebuild path so the rebuild + path tests can start from a known-good baseline. 
+ """ + inner = faiss.IndexHNSWFlat(int(descriptors.shape[1]), params.m, faiss.METRIC_L2) + inner.hnsw.efConstruction = params.ef_construction + inner.hnsw.efSearch = params.ef_search + wrapped = faiss.IndexIDMap2(inner) + int64_ids = np.asarray([tile_id_to_int64(t) for t in tile_ids], dtype=np.int64) + wrapped.add_with_ids(descriptors, int64_ids) + + faiss.write_index(wrapped, str(index_path)) + + payload = index_path.read_bytes() + digest = hashlib.sha256(payload).hexdigest() + Path(str(index_path) + SIDECAR_SUFFIX).write_text(digest, encoding="ascii") + + meta = { + "descriptor_dim": int(descriptors.shape[1]), + "n_vectors": len(tile_ids), + "backbone_label": "test_backbone", + "backbone_sha256_hex": "0" * 64, + "built_at": datetime.now().astimezone().isoformat(), + "hnsw_params": dataclasses.asdict(params), + "sidecar_sha256_hex": digest, + "file_path": str(index_path), + "id_mapping": [ + { + "int64_id": int(int64), + "zoom_level": tile_id.zoom_level, + "lat": tile_id.lat, + "lon": tile_id.lon, + } + for int64, tile_id in zip(int64_ids.tolist(), tile_ids, strict=True) + ], + } + Path(str(index_path) + META_SUFFIX).write_text( + json.dumps(meta, sort_keys=True, indent=2), encoding="utf-8" + ) + return index_path + + +def _open_existing(index_path: Path) -> FaissDescriptorIndex: + return FaissDescriptorIndex( + index_path=index_path, + sidecar=Sha256Sidecar(), + logger=_make_logger(), + ) + + +# --------------------------------------------------------------------- AC-1 + + +def test_ac1_search_topk_returns_self_match_first(tmp_path: Path) -> None: + # Arrange + n, d = 1000, 16 + descriptors = _make_descriptors(n, d, seed=42) + tile_ids = _sample_tile_ids(n) + params = HnswParams(m=16, ef_construction=40, ef_search=64) + _make_bootstrap_index_on_disk( + index_path=tmp_path / "tiles.index", + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + index = _open_existing(tmp_path / "tiles.index") + + # Act + results = index.search_topk(descriptors[0], 
k=5) + + # Assert + assert len(results) == 5 + assert results[0][0] == tile_ids[0] + assert results[0][1] < 1e-6 + distances = [d for _, d in results] + assert distances == sorted(distances) + + +# --------------------------------------------------------------------- AC-2 + + +def test_ac2_search_topk_returns_fewer_than_k_when_corpus_is_small(tmp_path: Path) -> None: + n, d = 3, 8 + descriptors = _make_descriptors(n, d, seed=1) + tile_ids = _sample_tile_ids(n) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=tmp_path / "tiles.index", + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + index = _open_existing(tmp_path / "tiles.index") + + results = index.search_topk(descriptors[1], k=10) + + assert len(results) == n + returned_tile_ids = {tid for tid, _ in results} + assert returned_tile_ids == set(tile_ids) + + +# --------------------------------------------------------------------- AC-3 + + +@pytest.fixture +def loaded_index(tmp_path: Path) -> FaissDescriptorIndex: + _build_initial_index(tmp_path, n=16, d=8) + return _open_existing(tmp_path / "tiles.index") + + +def test_ac3_search_topk_rejects_wrong_shape(loaded_index: FaissDescriptorIndex) -> None: + bad_query = np.zeros(loaded_index.descriptor_dim() + 1, dtype=np.float32) + with pytest.raises(IndexUnavailableError) as exc_info: + loaded_index.search_topk(bad_query, k=3) + assert "shape" in str(exc_info.value) + + +def test_ac3_search_topk_rejects_wrong_dtype(loaded_index: FaissDescriptorIndex) -> None: + bad_query = np.zeros(loaded_index.descriptor_dim(), dtype=np.float64) + with pytest.raises(IndexUnavailableError) as exc_info: + loaded_index.search_topk(bad_query, k=3) + assert "float32" in str(exc_info.value) + + +def test_ac3_search_topk_rejects_non_contiguous(loaded_index: FaissDescriptorIndex) -> None: + base = np.zeros((2, loaded_index.descriptor_dim()), dtype=np.float32) + # Strided view -> not C-contiguous + bad_query = 
base[::2][0] + bad_query = bad_query.reshape(loaded_index.descriptor_dim()) + if bad_query.flags["C_CONTIGUOUS"]: + # Force a non-contiguous view via slicing across the inner axis. + wide = np.zeros((1, loaded_index.descriptor_dim() * 2), dtype=np.float32) + bad_query = wide[0, ::2] + assert not bad_query.flags["C_CONTIGUOUS"] + with pytest.raises(IndexUnavailableError) as exc_info: + loaded_index.search_topk(bad_query, k=3) + assert "contiguous" in str(exc_info.value).lower() + + +# --------------------------------------------------------------------- AC-4 + + +def test_ac4_rebuild_atomic_on_simulated_crash( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + # Arrange — bootstrap a valid baseline. + n, d = 8, 4 + descriptors = _make_descriptors(n, d, seed=1) + tile_ids = _sample_tile_ids(n) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=tmp_path / "tiles.index", + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + original_bytes = (tmp_path / "tiles.index").read_bytes() + original_sidecar = (tmp_path / ("tiles.index" + SIDECAR_SUFFIX)).read_text() + + index = _open_existing(tmp_path / "tiles.index") + + # Act — simulate a crash AFTER the temp index is written but BEFORE + # the atomic rename swaps it into place. Real OS rename failures + # raise ``OSError``; ``Sha256Sidecar`` rewraps it to + # ``Sha256SidecarError`` which the rebuild path turns into + # ``IndexBuildError``. + sentinel = OSError("simulated crash before rename") + + def crashing_replace(*args: object, **kwargs: object) -> None: + raise sentinel + + # ``Sha256Sidecar.write_atomic_and_sidecar`` delegates to ``atomicwrites`` + # which on POSIX commits via ``os.rename``. Patching it simulates the + # commit step crashing AFTER the temp file lands but BEFORE the + # atomic swap completes — the exact AC-4 fault we want to verify. 
+ monkeypatch.setattr(os, "rename", crashing_replace) + + new_descriptors = _make_descriptors(n, d, seed=2) + with pytest.raises(IndexBuildError): + index.rebuild_from_descriptors(new_descriptors, tile_ids, params) + + # Assert — original index + sidecar are intact and reload-able. + assert (tmp_path / "tiles.index").read_bytes() == original_bytes + assert (tmp_path / ("tiles.index" + SIDECAR_SUFFIX)).read_text() == original_sidecar + monkeypatch.undo() + reopened = _open_existing(tmp_path / "tiles.index") + assert reopened.descriptor_dim() == d + + +# --------------------------------------------------------------------- AC-5 + + +def test_ac5_rebuild_writes_correct_sidecars(tmp_path: Path) -> None: + index_path = tmp_path / "tiles.index" + n, d = 16, 8 + descriptors = _make_descriptors(n, d, seed=3) + tile_ids = _sample_tile_ids(n) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + index = _open_existing(index_path) + + new_descriptors = _make_descriptors(n, d, seed=99) + before_call = time.time() + index.rebuild_from_descriptors(new_descriptors, tile_ids, params) + after_call = time.time() + + on_disk_sha = hashlib.sha256(index_path.read_bytes()).hexdigest() + sidecar_text = (Path(str(index_path) + SIDECAR_SUFFIX)).read_text(encoding="ascii").strip() + assert sidecar_text == on_disk_sha + + meta = json.loads((Path(str(index_path) + META_SUFFIX)).read_bytes().decode("utf-8")) + assert meta["descriptor_dim"] == d + assert meta["n_vectors"] == n + assert meta["sidecar_sha256_hex"] == on_disk_sha + assert meta["hnsw_params"] == dataclasses.asdict(params) + built_at = datetime.fromisoformat(meta["built_at"]).timestamp() + assert before_call - 1.0 <= built_at <= after_call + 1.0 + + +# --------------------------------------------------------------------- AC-6 + + +def test_ac6_corrupt_sidecar_blocks_construction(tmp_path: Path) -> None: 
+ index_path = tmp_path / "tiles.index" + descriptors = _make_descriptors(8, 4, seed=4) + tile_ids = _sample_tile_ids(8) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + Path(str(index_path) + SIDECAR_SUFFIX).write_text("0" * 64, encoding="ascii") + + with pytest.raises(IndexUnavailableError) as exc_info: + _open_existing(index_path) + assert "sidecar mismatch" in str(exc_info.value) + + +# --------------------------------------------------------------------- AC-7 + + +def test_ac7_missing_meta_blocks_construction(tmp_path: Path) -> None: + index_path = tmp_path / "tiles.index" + descriptors = _make_descriptors(4, 4, seed=5) + tile_ids = _sample_tile_ids(4) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + Path(str(index_path) + META_SUFFIX).unlink() + + with pytest.raises(IndexUnavailableError) as exc_info: + _open_existing(index_path) + assert "meta.json missing" in str(exc_info.value) + + +def test_ac7_malformed_meta_blocks_construction(tmp_path: Path) -> None: + index_path = tmp_path / "tiles.index" + descriptors = _make_descriptors(4, 4, seed=6) + tile_ids = _sample_tile_ids(4) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + Path(str(index_path) + META_SUFFIX).write_text("not-json{{", encoding="utf-8") + + with pytest.raises(IndexUnavailableError) as exc_info: + _open_existing(index_path) + assert "malformed" in str(exc_info.value) + + +# --------------------------------------------------------------------- AC-8 + + +def test_ac8_warmup_query_pages_in_mmap(tmp_path: Path) -> None: + n, d = 256, 16 + descriptors = _make_descriptors(n, d, seed=7) + tile_ids = 
_sample_tile_ids(n) + params = HnswParams(m=16, ef_construction=40, ef_search=32) + _make_bootstrap_index_on_disk( + index_path=tmp_path / "tiles.index", + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + + # Force the OS page cache cold (best-effort; no-op on platforms + # without posix_fadvise — the test still verifies warm-up runs and + # the subsequent search latency is bounded). + if hasattr(os, "posix_fadvise") and hasattr(os, "POSIX_FADV_DONTNEED"): + with (tmp_path / "tiles.index").open("rb") as fh: + os.posix_fadvise(fh.fileno(), 0, 0, os.POSIX_FADV_DONTNEED) + + warm = descriptors[7].copy() + instance = FaissDescriptorIndex( + index_path=tmp_path / "tiles.index", + sidecar=Sha256Sidecar(), + logger=_make_logger(), + warmup_query=warm, + ) + + # Subsequent searches should be fast (sanity bound — not the C2-PT-01 + # 5 ms canonical budget; this is a "warm-up succeeded" assertion). + samples_ms: list[float] = [] + for i in range(10): + q = descriptors[i].copy() + t0 = time.perf_counter() + instance.search_topk(q, k=5) + samples_ms.append((time.perf_counter() - t0) * 1000) + samples_ms.sort() + p95 = samples_ms[int(len(samples_ms) * 0.95)] + assert p95 < 50.0, f"warm search p95={p95:.3f} ms exceeded 50 ms sanity bound" + + +# --------------------------------------------------------------------- AC-9 + + +@pytest.mark.slow +def test_ac9_search_topk_p95_latency(tmp_path: Path) -> None: + n, d = 100_000, 16 + descriptors = _make_descriptors(n, d, seed=8) + tile_ids = _sample_tile_ids(n) + params = HnswParams(m=32, ef_construction=200, ef_search=64) + _make_bootstrap_index_on_disk( + index_path=tmp_path / "tiles.index", + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + index = _open_existing(tmp_path / "tiles.index") + # warm + for i in range(10): + index.search_topk(descriptors[i], k=10) + + rng = np.random.default_rng(123) + samples_ms: list[float] = [] + for _ in range(1000): + q = 
np.ascontiguousarray(rng.standard_normal(d).astype(np.float32)) + t0 = time.perf_counter() + index.search_topk(q, k=10) + samples_ms.append((time.perf_counter() - t0) * 1000) + samples_ms.sort() + p95 = samples_ms[int(len(samples_ms) * 0.95)] + assert p95 <= 5.0, f"search_topk p95={p95:.3f} ms exceeds 5 ms" + + +# --------------------------------------------------------------------- AC-10 + + +_FDI_FQN = "gps_denied_onboard.components.c6_tile_cache.faiss_descriptor_index" + + +def test_ac10_factory_gate_off_does_not_import_fdi(monkeypatch: pytest.MonkeyPatch) -> None: + from gps_denied_onboard.components.c6_tile_cache import C6TileCacheConfig + from gps_denied_onboard.runtime_root.errors import RuntimeNotAvailableError + from gps_denied_onboard.runtime_root.storage_factory import build_descriptor_index + + monkeypatch.delenv("BUILD_FAISS_INDEX", raising=False) + sys.modules.pop(_FDI_FQN, None) + + config = Config.with_blocks(c6_tile_cache=C6TileCacheConfig()) + with pytest.raises(RuntimeNotAvailableError) as exc_info: + build_descriptor_index(config) + assert "faiss_hnsw" in str(exc_info.value) + assert _FDI_FQN not in sys.modules + + +# --------------------------------------------------------------------- AC-11 + + +def test_ac11_int64_id_collision_raises_index_build_error( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + index_path = tmp_path / "tiles.index" + descriptors = _make_descriptors(4, 4, seed=9) + tile_ids = _sample_tile_ids(4) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + index = _open_existing(index_path) + pre_rebuild_bytes = index_path.read_bytes() + + # Force a deterministic collision via monkeypatch — every TileId + # maps to the same int64. 
+ monkeypatch.setattr(fdi_mod, "tile_id_to_int64", lambda _tile_id: 42) + + new_descriptors = _make_descriptors(4, 4, seed=10) + with pytest.raises(IndexBuildError) as exc_info: + index.rebuild_from_descriptors(new_descriptors, tile_ids, params) + assert "collision" in str(exc_info.value) + # On-disk index untouched. + assert index_path.read_bytes() == pre_rebuild_bytes + + +# --------------------------------------------------------------------- AC-12 + + +def test_ac12_index_metadata_round_trip(tmp_path: Path) -> None: + index_path = tmp_path / "tiles.index" + descriptors = _make_descriptors(16, 8, seed=11) + tile_ids = _sample_tile_ids(16) + params = HnswParams(m=8, ef_construction=20, ef_search=16, metric="L2") + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + index = _open_existing(index_path) + index.rebuild_from_descriptors(descriptors, tile_ids, params) + + meta = index.index_metadata() + on_disk_sha = hashlib.sha256(index_path.read_bytes()).hexdigest() + assert meta.descriptor_dim == 8 + assert meta.n_vectors == 16 + assert meta.hnsw_params == params + assert meta.sidecar_sha256_hex == on_disk_sha + assert meta.file_path == index_path + + +# --------------------------------------------------------------------- NFR-perf-rebuild + + +@pytest.mark.slow +def test_nfr_perf_rebuild_under_5_minutes_for_100k(tmp_path: Path) -> None: + n, d = 100_000, 16 + descriptors = _make_descriptors(n, d, seed=12) + tile_ids = _sample_tile_ids(n) + params = HnswParams(m=32, ef_construction=200, ef_search=64) + _make_bootstrap_index_on_disk( + index_path=tmp_path / "tiles.index", + tile_ids=tile_ids, + descriptors=descriptors, + params=params, + ) + index = _open_existing(tmp_path / "tiles.index") + + t0 = time.perf_counter() + index.rebuild_from_descriptors(descriptors, tile_ids, params) + elapsed = time.perf_counter() - t0 + assert elapsed < 300.0, f"rebuild took {elapsed:.1f}s; expected < 300s" + + +# 
--------------------------------------------------------------------- NFR-reliability-rewrap + + +def test_nfr_reliability_search_runtime_error_rewrapped( + loaded_index: FaissDescriptorIndex, monkeypatch: pytest.MonkeyPatch +) -> None: + sentinel = RuntimeError("simulated FAISS C++ failure") + + def raising_search(*args: object, **kwargs: object) -> None: + raise sentinel + + monkeypatch.setattr(loaded_index._index, "search", raising_search) + query = np.zeros(loaded_index.descriptor_dim(), dtype=np.float32) + with pytest.raises(IndexUnavailableError) as exc_info: + loaded_index.search_topk(query, k=3) + assert exc_info.value.__cause__ is sentinel + + +def test_nfr_reliability_rebuild_runtime_error_rewrapped( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + index_path, tile_ids, descriptors, params = _build_initial_index( + tmp_path, n=8, d=4 + ) + index = _open_existing(index_path) + + sentinel = RuntimeError("simulated FAISS write failure") + + def raising_write(*args: object, **kwargs: object) -> None: + raise sentinel + + monkeypatch.setattr(faiss, "write_index", raising_write) + + with pytest.raises(IndexBuildError) as exc_info: + index.rebuild_from_descriptors(descriptors, tile_ids, params) + assert exc_info.value.__cause__ is sentinel + + +# --------------------------------------------------------------------- from_config smoke + + +def test_from_config_resolves_paths_and_warmup( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + index_path = tmp_path / "tiles.index" + descriptors = _make_descriptors(8, 4, seed=13) + tile_ids = _sample_tile_ids(8) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + warmup_path = tmp_path / "warmup.npy" + np.save(warmup_path, descriptors[0]) + + block = C6TileCacheConfig( + faiss_index_path=str(index_path), + faiss_warmup_query_path=str(warmup_path), + ) + config = 
Config.with_blocks(c6_tile_cache=block) + instance = FaissDescriptorIndex.from_config(config) + assert instance.descriptor_dim() == 4 + assert instance.mmap_handle() == index_path + + +def test_from_config_default_path_uses_root_dir(tmp_path: Path) -> None: + root = tmp_path / "root" + root.mkdir() + index_path = root / "descriptor.index" + descriptors = _make_descriptors(4, 4, seed=14) + tile_ids = _sample_tile_ids(4) + params = HnswParams(m=8, ef_construction=20, ef_search=16) + _make_bootstrap_index_on_disk( + index_path=index_path, tile_ids=tile_ids, descriptors=descriptors, params=params + ) + + block = C6TileCacheConfig(root_dir=str(root)) + config = Config.with_blocks(c6_tile_cache=block) + instance = FaissDescriptorIndex.from_config(config) + assert instance.mmap_handle() == index_path + + +# --------------------------------------------------------------------- module-import sanity +# The module is intentionally import-clean so the AZ-303 factory gate +# can pre-empt the import via the BUILD_FAISS_INDEX env flag. + + +def test_module_is_import_clean() -> None: + # AC-10's negative-path test removes the module from sys.modules; use + # ``import_module`` so this assertion works whether the module is + # currently cached or not. 
+ sys.modules.pop(_FDI_FQN, None) + fresh = importlib.import_module(_FDI_FQN) + assert hasattr(fresh, "FaissDescriptorIndex") + assert hasattr(fresh, "tile_id_to_int64") diff --git a/tests/unit/c6_tile_cache/test_protocol_conformance.py b/tests/unit/c6_tile_cache/test_protocol_conformance.py index d2309db..07fc97f 100644 --- a/tests/unit/c6_tile_cache/test_protocol_conformance.py +++ b/tests/unit/c6_tile_cache/test_protocol_conformance.py @@ -297,6 +297,13 @@ def _install_fake_faiss_impl_module() -> type: def __init__(self, config: Config) -> None: self.config = config + @classmethod + def from_config(cls, config: Config) -> _FakeFaissDescriptorIndex: + # AZ-306: factory now dispatches via from_config so the production + # impl can wire its Sha256Sidecar / logger / warmup query without + # the runtime_root touching them. Mirror PostgresFilesystemStore. + return cls(config) + fake_module = types.ModuleType(_FAKE_IMPL_MODULE) fake_module.FaissDescriptorIndex = _FakeFaissDescriptorIndex # type: ignore[attr-defined] sys.modules[_FAKE_IMPL_MODULE] = fake_module