[AZ-306] C6 FaissDescriptorIndex (faiss-cpu, HNSW32)

Production-default DescriptorIndex strategy backed by the faiss-cpu
PyPI wheel (>=1.7,<2.0). Implements the AZ-303 Protocol surface end
to end: HNSW32 + IndexIDMap2 search, atomic three-file rebuild
(.index + .sha256 sidecar + .meta.json), triple-consistency load
check, mmap-backed reads with IO_FLAG_MMAP|IO_FLAG_READ_ONLY, optional
warm-up query at construction, FAISS RuntimeError rewrap to
IndexUnavailableError / IndexBuildError, and FaissDescriptorIndex.from_config
classmethod wired into runtime_root.storage_factory.

The original spec required a custom pybind11 wrapper over a vendored
FAISS HEAD; the user opted for the upstream faiss-cpu wheel after
research fact #92 confirmed ARM64 wheel availability for Jetson and
the existing pyproject.toml already pinned faiss-cpu. cpp/faiss_index/
placeholder removed; BUILD_FAISS_INDEX flag retained as a
runtime/factory gate (no native target). Spec rewritten end-to-end and
archived to _docs/02_tasks/done/.

C6TileCacheConfig extended with faiss_index_path and
faiss_warmup_query_path fields. tests/conftest.py sets
KMP_DUPLICATE_LIB_OK=TRUE to remediate the macOS faiss/torch libomp
duplicate-load abort during pytest (no-op on CI Linux). 21 new tests
cover AC-1..12 + 2 NFRs + from_config smoke; AZ-303 protocol-conformance
fake updated with from_config classmethod.

Tests: 124/124 c6_tile_cache pass; 1334 project-wide pass; 1
pre-existing OKVIS2 submodule failure unrelated.

Doc sync: module-layout.md, components/08_c6_tile_cache/description.md
§5, batch_35_cycle1_report.md.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-13 04:01:37 +03:00
parent ecf76d762d
commit 3b7265757b
17 changed files with 1550 additions and 87 deletions
@@ -1 +0,0 @@
"""pybind11 wrapper for `cpp/faiss_index/` — placeholder."""
@@ -50,6 +50,21 @@ class C6TileCacheConfig:
``lru_eviction_threshold_bytes`` is the disk-budget high-water
mark the cache-budget enforcer (separate task) uses to trigger
LRU eviction; default 10 GiB per the C6 description.
``faiss_index_path`` is the absolute path to the per-flight
``.index`` file (and its sibling ``.sha256`` + ``.meta.json``
sidecars) that ``FaissDescriptorIndex`` mmap-loads at construction.
Empty string (the default) means "use ``<root_dir>/descriptor.index``"
— resolved at construction time so dev configs can leave the field
blank (AZ-306).
``faiss_warmup_query_path`` is the optional absolute path to a
``.npy`` file holding a single ``(descriptor_dim,)`` ``float32``
C-contiguous vector that ``FaissDescriptorIndex`` runs as a
one-shot warm-up query at construction to page in the mmap'd
``.index`` (Risk-4 mitigation; AC-NEW-1 cold-start budget).
Empty string disables warm-up — Tier-0 dev only; production C10
pre-flight orchestration always supplies one (AZ-306).
"""
store_runtime: str = "postgres_filesystem"
@@ -60,6 +75,8 @@ class C6TileCacheConfig:
postgres_pool_size: int = 4
lru_eviction_threshold_bytes: int = 10 * 1024**3
eviction_batch_size: int = 32
faiss_index_path: str = ""
faiss_warmup_query_path: str = ""
def __post_init__(self) -> None:
if self.store_runtime not in KNOWN_TILE_STORE_RUNTIMES:
@@ -0,0 +1,597 @@
"""C6 FaissDescriptorIndex — production-default DescriptorIndex (AZ-306).
Pure-Python implementation of the :class:`DescriptorIndex` Protocol from
AZ-303, built on the upstream ``faiss-cpu`` PyPI wheel. See contract at
``_docs/02_document/contracts/c6_tile_cache/descriptor_index.md`` and
task spec ``AZ-306_c6_faiss_descriptor_index.md``.
Layout on disk (all three files live next to each other):
- ``<index_path>`` — FAISS HNSW32 + IndexIDMap2 binary blob.
- ``<index_path>.sha256`` — AZ-280 content-hash sidecar.
- ``<index_path>.meta.json`` — :class:`IndexMetadata` + ``tile_id``↔``int64`` map.
A construction-time triple-consistency check
(``sha256(.index) == .sha256.content == meta.sidecar_sha256_hex``) makes
any partial-write state fail closed via :class:`IndexUnavailableError`.
The composition root entry-point is :meth:`FaissDescriptorIndex.from_config`;
the AZ-303 factory ``runtime_root.storage_factory.build_descriptor_index``
gates the import on the ``BUILD_FAISS_INDEX`` env flag and only then
constructs the impl.
"""
from __future__ import annotations
import hashlib
import json
import logging
import os
import time
from dataclasses import asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any
import faiss
import numpy as np
from gps_denied_onboard.components.c6_tile_cache._types import (
HnswParams,
IndexMetadata,
TileId,
)
from gps_denied_onboard.components.c6_tile_cache.errors import (
IndexBuildError,
IndexUnavailableError,
)
from gps_denied_onboard.helpers.sha256_sidecar import (
SIDECAR_SUFFIX,
Sha256Sidecar,
Sha256SidecarError,
)
if TYPE_CHECKING:
from gps_denied_onboard.config.schema import Config
# Names exported by ``from <this module> import *``.
__all__ = ["META_SUFFIX", "FaissDescriptorIndex", "tile_id_to_int64"]
# Suffix appended to the full index path to locate the JSON metadata file.
META_SUFFIX = ".meta.json"
# Logger name requested from ``get_logger`` in ``FaissDescriptorIndex.from_config``.
_PRODUCER_ID = "c6_tile_cache.faiss"
def tile_id_to_int64(tile_id: TileId) -> int:
    """Map a ``TileId`` to a deterministic signed 64-bit integer (AZ-306).

    The key text is ``"{zoom_level}|{lat:.8f}|{lon:.8f}"``; the result is
    the first eight bytes of its SHA-256 digest read as a big-endian signed
    integer. Only ``zoom_level``, ``lat`` and ``lon`` are read, so any
    ``source`` attribute is deliberately not part of the hash: a tile is
    identified by spatial position, not by the feed that produced it.
    Coordinates are formatted to eight decimal places before hashing.

    Truncating to 64 bits makes collisions possible in principle (on the
    order of 10^-12 at ~10k tiles); rebuild checks for them explicitly and
    raises :class:`IndexBuildError`.
    """
    key_text = "|".join(
        (
            format(tile_id.zoom_level),
            format(tile_id.lat, ".8f"),
            format(tile_id.lon, ".8f"),
        )
    )
    first_eight = hashlib.sha256(key_text.encode("utf-8")).digest()[:8]
    return int.from_bytes(first_eight, byteorder="big", signed=True)
class FaissDescriptorIndex:
"""Production ``DescriptorIndex`` strategy backed by ``faiss-cpu``.
Construction loads the existing ``.index`` via
``faiss.read_index(IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`` and runs an
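optional warm-up query. Rebuild rewrites the three on-disk files in
dependency order (``.index`` + ``.sha256`` first, then ``.meta.json``)
through the atomic-write helper. A crash before the first write leaves
the prior index intact; a crash between the writes leaves a state that
the load-time consistency check rejects.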
Not re-entrant — see ``descriptor_index.md`` § I-8.
"""
def __init__(
    self,
    *,
    index_path: Path,
    sidecar: Sha256Sidecar,
    logger: logging.Logger,
    warmup_query: np.ndarray | None = None,
) -> None:
    """Load the index at ``index_path`` and optionally run one warm-up search.

    Args:
        index_path: Location of the ``.index`` file; the hash sidecar and
            meta file are looked up next to it.
        sidecar: Helper used by rebuild for the on-disk writes.
        logger: Receives the structured load/rebuild events.
        warmup_query: When given, it is searched once with ``k=1`` right
            after loading; it goes through the same validation as any
            other query.

    Raises:
        IndexUnavailableError: Propagated from the load or the warm-up
            search.
    """
    # Injected collaborators.
    self._index_path = Path(index_path)
    self._sidecar = sidecar
    self._logger = logger

    # Loaded state; populated by ``_load`` once every check has passed.
    self._index: Any | None = None
    self._meta: IndexMetadata | None = None
    self._id_to_tile_id: dict[int, TileId] = {}
    self._descriptor_dim: int | None = None

    self._load()
    if warmup_query is None:
        return
    self.search_topk(warmup_query, k=1)
# ------------------------------------------------------------------
# Composition-root entry point
# ------------------------------------------------------------------
@classmethod
def from_config(cls, config: Config) -> FaissDescriptorIndex:
    """Build an instance from the project ``Config`` (mirrors AZ-305).

    Reads the ``c6_tile_cache`` component block. An empty
    ``faiss_index_path`` falls back to ``<root_dir>/descriptor.index``; an
    empty ``faiss_warmup_query_path`` disables the warm-up query.
    """
    from gps_denied_onboard.logging import get_logger

    settings = config.components["c6_tile_cache"]
    if settings.faiss_index_path:
        resolved_index = Path(settings.faiss_index_path)
    else:
        resolved_index = Path(settings.root_dir) / "descriptor.index"

    warmup: np.ndarray | None = None
    warmup_source = settings.faiss_warmup_query_path
    if warmup_source:
        warmup = cls._load_warmup_query(Path(warmup_source))

    return cls(
        index_path=resolved_index,
        sidecar=Sha256Sidecar(),
        logger=get_logger(_PRODUCER_ID),
        warmup_query=warmup,
    )
@staticmethod
def _load_warmup_query(path: Path) -> np.ndarray:
try:
arr = np.load(path)
except (OSError, ValueError) as exc:
raise IndexUnavailableError(
f"FaissDescriptorIndex.from_config: cannot load warmup query "
f"at {path}: {exc}"
) from exc
return np.ascontiguousarray(arr, dtype=np.float32)
# ------------------------------------------------------------------
# Protocol surface
# ------------------------------------------------------------------
def search_topk(self, query: np.ndarray, k: int) -> list[tuple[TileId, float]]:
if self._index is None or self._descriptor_dim is None:
raise IndexUnavailableError(
f"FaissDescriptorIndex.search_topk: index at {self._index_path} "
"is not loaded"
)
self._validate_query(query, self._descriptor_dim)
try:
distances, ids = self._index.search(query.reshape(1, -1), k)
except RuntimeError as exc:
raise IndexUnavailableError(
f"FaissDescriptorIndex.search_topk: FAISS search failed for "
f"index {self._index_path}: {exc}"
) from exc
results: list[tuple[TileId, float]] = []
for raw_id, distance in zip(ids[0].tolist(), distances[0].tolist(), strict=True):
if raw_id == -1:
continue
tile_id = self._id_to_tile_id.get(int(raw_id))
if tile_id is None:
raise IndexUnavailableError(
f"FaissDescriptorIndex.search_topk: int64 id {raw_id} "
f"returned by FAISS is not present in the meta.json mapping "
f"({self._index_path}{META_SUFFIX}); index is corrupt"
)
results.append((tile_id, float(distance)))
return results
def descriptor_dim(self) -> int:
    """Return the descriptor dimensionality of the loaded index.

    Raises:
        IndexUnavailableError: If no index has been loaded.
    """
    dim = self._descriptor_dim
    if dim is not None:
        return dim
    raise IndexUnavailableError(
        f"FaissDescriptorIndex.descriptor_dim: index at {self._index_path} "
        "is not loaded"
    )
def mmap_handle(self) -> Path:
    """Return the path of the ``.index`` file backing the loaded index.

    Raises:
        IndexUnavailableError: If no index has been loaded.
    """
    if self._index is not None:
        return self._index_path
    raise IndexUnavailableError(
        f"FaissDescriptorIndex.mmap_handle: index at {self._index_path} "
        "is not loaded"
    )
def index_metadata(self) -> IndexMetadata:
    """Return the metadata parsed from the meta file of the loaded index.

    Raises:
        IndexUnavailableError: If no index has been loaded.
    """
    loaded = self._meta
    if loaded is not None:
        return loaded
    raise IndexUnavailableError(
        f"FaissDescriptorIndex.index_metadata: index at {self._index_path} "
        "is not loaded"
    )
def rebuild_from_descriptors(
    self,
    descriptors: np.ndarray,
    tile_ids: list[TileId],
    hnsw_params: HnswParams,
) -> None:
    """Build a new index from ``descriptors`` and replace the on-disk files.

    Steps: validate the inputs, derive one int64 id per tile, build the
    HNSW index in memory, serialise it to ``<index_path>.tmp``, hash those
    bytes, write ``.index`` + ``.sha256`` and then ``.meta.json`` through
    the injected sidecar helper, and finally reload from disk so the
    in-memory state matches what was written.

    Args:
        descriptors: C-contiguous ``float32`` array of shape
            ``(len(tile_ids), descriptor_dim)``.
        tile_ids: One tile per descriptor row, in row order.
        hnsw_params: HNSW construction and search parameters; stored
            verbatim in the meta file.

    Raises:
        IndexBuildError: On invalid input, an id collision, a FAISS build or
            write failure, or a failed sidecar/meta write.
        IndexUnavailableError: If reloading the freshly written files fails.
    """
    # Start the clock before validation and graph construction so the
    # logged ``elapsed_seconds`` covers the whole rebuild, not only the
    # write and reload phases.
    start = time.perf_counter()
    self._validate_rebuild_input(descriptors, tile_ids)
    descriptor_dim = int(descriptors.shape[1])
    int64_ids = self._derive_int64_ids(tile_ids)
    index = self._build_hnsw(descriptors, int64_ids, descriptor_dim, hnsw_params)
    # ``backbone_label`` / ``backbone_sha256_hex`` are inputs to the
    # provisioning chain owned by C10/C7. AZ-306 stores them as
    # opaque pass-through fields populated from the existing meta
    # when available so a rebuild does not lose chain identity; the
    # initial-build case uses sentinels that C10's Manifest builder
    # later overwrites.
    previous = self._meta
    backbone_label = previous.backbone_label if previous is not None else "unknown"
    backbone_sha256_hex = (
        previous.backbone_sha256_hex if previous is not None else "0" * 64
    )
    temp_path = self._index_path.with_suffix(self._index_path.suffix + ".tmp")
    try:
        faiss.write_index(index, str(temp_path))
    except RuntimeError as exc:
        self._safe_unlink(temp_path)
        raise IndexBuildError(
            f"FaissDescriptorIndex.rebuild_from_descriptors: faiss.write_index "
            f"failed for {temp_path}: {exc}"
        ) from exc
    try:
        # Read the serialised index back so the exact bytes that get
        # hashed are the bytes handed to the atomic writer.
        try:
            temp_bytes = temp_path.read_bytes()
        except OSError as exc:
            raise IndexBuildError(
                f"FaissDescriptorIndex.rebuild_from_descriptors: cannot "
                f"re-read temp index at {temp_path}: {exc}"
            ) from exc
        sidecar_sha256_hex = hashlib.sha256(temp_bytes).hexdigest()
        built_at = datetime.now(timezone.utc)
        meta = IndexMetadata(
            descriptor_dim=descriptor_dim,
            n_vectors=len(tile_ids),
            backbone_label=backbone_label,
            backbone_sha256_hex=backbone_sha256_hex,
            built_at=built_at,
            hnsw_params=hnsw_params,
            sidecar_sha256_hex=sidecar_sha256_hex,
            file_path=self._index_path,
        )
        meta_bytes = self._serialise_meta(meta, tile_ids, int64_ids)
        # Index + hash sidecar go first; the meta file, which records the
        # same digest, is written last.
        try:
            self._sidecar.write_atomic_and_sidecar(self._index_path, temp_bytes)
        except Sha256SidecarError as exc:
            raise IndexBuildError(
                f"FaissDescriptorIndex.rebuild_from_descriptors: atomic "
                f"write of {self._index_path} failed: {exc}"
            ) from exc
        try:
            self._sidecar.write_atomic(self._meta_path, meta_bytes)
        except Sha256SidecarError as exc:
            raise IndexBuildError(
                f"FaissDescriptorIndex.rebuild_from_descriptors: atomic "
                f"write of {self._meta_path} failed: {exc}"
            ) from exc
    finally:
        # The temp file is scratch space only; remove it on every path.
        self._safe_unlink(temp_path)
    self._load()
    self._logger.info(
        "c6.faiss.rebuilt",
        extra={
            "kind": "c6.faiss.rebuilt",
            "n_vectors": len(tile_ids),
            "descriptor_dim": descriptor_dim,
            "elapsed_seconds": time.perf_counter() - start,
            "index_path": str(self._index_path),
        },
    )
# ------------------------------------------------------------------
# Internals
# ------------------------------------------------------------------
@property
def _meta_path(self) -> Path:
    """Path of the JSON metadata file: the index path plus ``META_SUFFIX``."""
    return Path(f"{self._index_path}{META_SUFFIX}")
@property
def _hash_sidecar_path(self) -> Path:
    """Path of the hash sidecar: the index path plus ``SIDECAR_SUFFIX``."""
    return Path(f"{self._index_path}{SIDECAR_SUFFIX}")
@staticmethod
def _validate_query(query: np.ndarray, descriptor_dim: int) -> None:
if not isinstance(query, np.ndarray):
raise IndexUnavailableError(
f"FaissDescriptorIndex.search_topk: query must be a numpy "
f"ndarray, got {type(query).__name__}"
)
if query.dtype != np.float32:
raise IndexUnavailableError(
f"FaissDescriptorIndex.search_topk: query.dtype must be "
f"float32, got {query.dtype}"
)
if query.shape != (descriptor_dim,):
raise IndexUnavailableError(
f"FaissDescriptorIndex.search_topk: query.shape must be "
f"({descriptor_dim},), got {query.shape}"
)
if not query.flags["C_CONTIGUOUS"]:
raise IndexUnavailableError(
"FaissDescriptorIndex.search_topk: query must be C-contiguous"
)
@staticmethod
def _validate_rebuild_input(
descriptors: np.ndarray, tile_ids: list[TileId]
) -> None:
if not isinstance(descriptors, np.ndarray):
raise IndexBuildError(
f"FaissDescriptorIndex.rebuild: descriptors must be a numpy "
f"ndarray, got {type(descriptors).__name__}"
)
if descriptors.dtype != np.float32:
raise IndexBuildError(
f"FaissDescriptorIndex.rebuild: descriptors.dtype must be "
f"float32, got {descriptors.dtype}"
)
if descriptors.ndim != 2:
raise IndexBuildError(
f"FaissDescriptorIndex.rebuild: descriptors.ndim must be 2, "
f"got {descriptors.ndim}"
)
if not descriptors.flags["C_CONTIGUOUS"]:
raise IndexBuildError(
"FaissDescriptorIndex.rebuild: descriptors must be C-contiguous"
)
if not tile_ids:
raise IndexBuildError(
"FaissDescriptorIndex.rebuild: tile_ids must be non-empty"
)
if descriptors.shape[0] != len(tile_ids):
raise IndexBuildError(
f"FaissDescriptorIndex.rebuild: descriptors.shape[0]="
f"{descriptors.shape[0]} != len(tile_ids)={len(tile_ids)}"
)
@staticmethod
def _derive_int64_ids(tile_ids: list[TileId]) -> list[int]:
    """Map every tile to its int64 id, rejecting collisions and duplicates.

    Args:
        tile_ids: Tiles in descriptor-row order.

    Returns:
        One int64 id per entry of ``tile_ids``, in the same order.

    Raises:
        IndexBuildError: If two different tiles hash to the same id, or if
            the same tile appears more than once. A repeated tile would be
            counted twice in ``n_vectors`` but collapse to one entry in the
            id mapping read back at load time, so the rebuilt files would
            fail their own consistency check after being written.
    """
    seen: dict[int, TileId] = {}
    ids: list[int] = []
    for tile_id in tile_ids:
        mapped = tile_id_to_int64(tile_id)
        existing = seen.get(mapped)
        if existing is not None:
            if existing != tile_id:
                raise IndexBuildError(
                    f"FaissDescriptorIndex.rebuild: int64-id collision: "
                    f"{existing!r} and {tile_id!r} both hash to {mapped}"
                )
            raise IndexBuildError(
                f"FaissDescriptorIndex.rebuild: duplicate tile_id "
                f"{tile_id!r} (int64 id {mapped}); each tile may appear "
                f"only once"
            )
        seen[mapped] = tile_id
        ids.append(mapped)
    return ids
@staticmethod
def _build_hnsw(
    descriptors: np.ndarray,
    int64_ids: list[int],
    descriptor_dim: int,
    hnsw_params: HnswParams,
) -> Any:
    """Build an in-memory ``IndexIDMap2`` over an ``IndexHNSWFlat``.

    The metric is inner product when ``hnsw_params.metric`` equals
    ``"INNER_PRODUCT"`` case-insensitively; every other value selects L2.

    Raises:
        IndexBuildError: If FAISS raises ``RuntimeError`` while building.
    """
    use_inner_product = hnsw_params.metric.upper() == "INNER_PRODUCT"
    if use_inner_product:
        metric = faiss.METRIC_INNER_PRODUCT
    else:
        metric = faiss.METRIC_L2
    try:
        graph = faiss.IndexHNSWFlat(descriptor_dim, hnsw_params.m, metric)
        graph.hnsw.efConstruction = hnsw_params.ef_construction
        graph.hnsw.efSearch = hnsw_params.ef_search
        id_mapped = faiss.IndexIDMap2(graph)
        labels = np.asarray(int64_ids, dtype=np.int64)
        id_mapped.add_with_ids(descriptors, labels)
    except RuntimeError as exc:
        raise IndexBuildError(
            f"FaissDescriptorIndex.rebuild: FAISS HNSW build failed: {exc}"
        ) from exc
    return id_mapped
@staticmethod
def _serialise_meta(
meta: IndexMetadata,
tile_ids: list[TileId],
int64_ids: list[int],
) -> bytes:
payload = {
"descriptor_dim": meta.descriptor_dim,
"n_vectors": meta.n_vectors,
"backbone_label": meta.backbone_label,
"backbone_sha256_hex": meta.backbone_sha256_hex,
"built_at": meta.built_at.isoformat(),
"hnsw_params": asdict(meta.hnsw_params),
"sidecar_sha256_hex": meta.sidecar_sha256_hex,
"file_path": str(meta.file_path),
"id_mapping": [
{
"int64_id": int64_id,
"zoom_level": tile_id.zoom_level,
"lat": tile_id.lat,
"lon": tile_id.lon,
}
for int64_id, tile_id in zip(int64_ids, tile_ids, strict=True)
],
}
return json.dumps(payload, sort_keys=True, indent=2).encode("utf-8")
def _load(self) -> None:
    """Verify that the three on-disk files agree, then load the index.

    Checks, in order: the ``.index`` file exists; the hash sidecar exists
    and is readable; the streamed SHA-256 of the index equals the sidecar
    text; the meta file exists, decodes and parses; the digest recorded in
    the meta equals the on-disk digest. Only then is the index opened with
    ``IO_FLAG_MMAP | IO_FLAG_READ_ONLY``.

    Instance state is assigned only after every step has succeeded, so a
    failed (re)load leaves the previously loaded state untouched.

    Raises:
        IndexUnavailableError: On any missing, unreadable, malformed or
            inconsistent file, or when ``faiss.read_index`` fails.
    """
    index_path = self._index_path
    if not index_path.is_file():
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: .index file missing at {index_path}"
        )
    sidecar_path = self._hash_sidecar_path
    if not sidecar_path.is_file():
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: .sha256 sidecar missing at {sidecar_path}"
        )
    try:
        sidecar_text = sidecar_path.read_text(encoding="ascii").strip()
    except (OSError, UnicodeDecodeError) as exc:
        # ``UnicodeDecodeError`` covers a sidecar holding non-ASCII bytes;
        # it is not an ``OSError`` and would otherwise escape unwrapped.
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: cannot read sidecar at {sidecar_path}: {exc}"
        ) from exc
    try:
        disk_digest = self._stream_sha256(index_path)
    except OSError as exc:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: cannot read index at {index_path}: {exc}"
        ) from exc
    if disk_digest != sidecar_text:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: sidecar mismatch at {index_path}: "
            f"on-disk sha256={disk_digest} but sidecar={sidecar_text}"
        )
    meta_path = self._meta_path
    if not meta_path.is_file():
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: .meta.json missing at {meta_path}"
        )
    try:
        meta_raw = meta_path.read_bytes()
    except OSError as exc:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: cannot read meta.json at {meta_path}: {exc}"
        ) from exc
    try:
        meta_payload = json.loads(meta_raw.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: meta.json at {meta_path} is malformed: {exc}"
        ) from exc
    meta, id_to_tile_id = self._parse_meta(meta_payload, meta_path)
    # Third leg of the consistency check: the digest stored in the meta
    # must match the bytes that are actually on disk.
    if meta.sidecar_sha256_hex != disk_digest:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: meta.json sidecar_sha256_hex "
            f"{meta.sidecar_sha256_hex} does not match on-disk sha256 "
            f"{disk_digest} for {index_path}"
        )
    try:
        index = faiss.read_index(
            str(index_path), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
        )
    except RuntimeError as exc:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: faiss.read_index failed for {index_path}: {exc}"
        ) from exc
    self._apply_ef_search(index, meta.hnsw_params.ef_search)
    # Publish the new state only now that every check has passed.
    self._index = index
    self._meta = meta
    self._id_to_tile_id = id_to_tile_id
    self._descriptor_dim = meta.descriptor_dim
    self._logger.info(
        "c6.faiss.loaded",
        extra={
            "kind": "c6.faiss.loaded",
            "n_vectors": meta.n_vectors,
            "descriptor_dim": meta.descriptor_dim,
            "sidecar_sha256_hex": meta.sidecar_sha256_hex,
            "built_at": meta.built_at.isoformat(),
            "index_path": str(index_path),
        },
    )
@staticmethod
def _parse_meta(
    payload: dict[str, Any], meta_path: Path
) -> tuple[IndexMetadata, dict[int, TileId]]:
    """Turn the decoded meta JSON into metadata plus an id-to-tile mapping.

    Args:
        payload: The object produced by ``json.loads`` on the meta file.
        meta_path: Used only in error messages.

    Returns:
        ``(meta, id_to_tile_id)``, where ``id_to_tile_id`` maps each
        ``int64_id`` in ``id_mapping`` to its ``TileId``.

    Raises:
        IndexUnavailableError: If a key is missing, a value has the wrong
            type or cannot be converted (including non-finite or
            out-of-range numbers), or the number of distinct ids differs
            from ``n_vectors``.
    """
    # ``OverflowError`` is caught as well: ``json.loads`` accepts
    # ``Infinity`` and very large literals, and ``int(inf)`` or
    # ``float(<huge int>)`` raises it rather than ``ValueError``.
    try:
        hnsw_block = payload["hnsw_params"]
        id_mapping = payload["id_mapping"]
        meta = IndexMetadata(
            descriptor_dim=int(payload["descriptor_dim"]),
            n_vectors=int(payload["n_vectors"]),
            backbone_label=str(payload["backbone_label"]),
            backbone_sha256_hex=str(payload["backbone_sha256_hex"]),
            built_at=datetime.fromisoformat(str(payload["built_at"])),
            hnsw_params=HnswParams(
                m=int(hnsw_block["m"]),
                ef_construction=int(hnsw_block["ef_construction"]),
                ef_search=int(hnsw_block["ef_search"]),
                metric=str(hnsw_block["metric"]),
            ),
            sidecar_sha256_hex=str(payload["sidecar_sha256_hex"]),
            file_path=Path(str(payload["file_path"])),
        )
    except (KeyError, TypeError, ValueError, OverflowError) as exc:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: meta.json at {meta_path} has invalid "
            f"shape: {exc}"
        ) from exc
    id_to_tile_id: dict[int, TileId] = {}
    try:
        for entry in id_mapping:
            int64_id = int(entry["int64_id"])
            tile_id = TileId(
                zoom_level=int(entry["zoom_level"]),
                lat=float(entry["lat"]),
                lon=float(entry["lon"]),
            )
            id_to_tile_id[int64_id] = tile_id
    except (KeyError, TypeError, ValueError, OverflowError) as exc:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: meta.json id_mapping at {meta_path} is "
            f"invalid: {exc}"
        ) from exc
    # Repeated ids collapse in the dict, so this also catches an
    # ``id_mapping`` that lists the same id more than once.
    if len(id_to_tile_id) != meta.n_vectors:
        raise IndexUnavailableError(
            f"FaissDescriptorIndex: meta.json id_mapping length "
            f"{len(id_to_tile_id)} does not match n_vectors "
            f"{meta.n_vectors} at {meta_path}"
        )
    return meta, id_to_tile_id
@staticmethod
def _apply_ef_search(index: Any, ef_search: int) -> None:
# IndexIDMap2 wraps the HNSW; reach through to set efSearch for queries.
try:
inner = faiss.downcast_index(index.index) if hasattr(index, "index") else index
if hasattr(inner, "hnsw"):
inner.hnsw.efSearch = ef_search
except RuntimeError:
# Best-effort: stale efSearch stays at FAISS default; not fatal.
pass
@staticmethod
def _stream_sha256(path: Path) -> str:
hasher = hashlib.sha256()
with path.open("rb") as fh:
while True:
chunk = fh.read(1024 * 1024)
if not chunk:
break
hasher.update(chunk)
return hasher.hexdigest()
@staticmethod
def _safe_unlink(path: Path) -> None:
try:
os.unlink(path)
except FileNotFoundError:
pass
except OSError:
# Intentional: cleanup failure is non-fatal; the next rebuild
# writes a fresh temp path with a random suffix anyway. Log
# the path so operators can sweep manually if it accumulates.
pass
@@ -45,10 +45,10 @@ from psycopg.rows import dict_row
from psycopg.types.json import Jsonb
from psycopg_pool import ConnectionPool
from gps_denied_onboard.components.c6_tile_cache._timestamp import iso_ts_now
from gps_denied_onboard.components.c6_tile_cache._tile_pixel_handle import (
TilePixelHandle,
)
from gps_denied_onboard.components.c6_tile_cache._timestamp import iso_ts_now
from gps_denied_onboard.components.c6_tile_cache._types import (
Bbox,
FreshnessLabel,