|
|
|
@@ -0,0 +1,597 @@
|
|
|
|
|
"""C6 FaissDescriptorIndex — production-default DescriptorIndex (AZ-306).
|
|
|
|
|
|
|
|
|
|
Pure-Python implementation of the :class:`DescriptorIndex` Protocol from
|
|
|
|
|
AZ-303, built on the upstream ``faiss-cpu`` PyPI wheel. See contract at
|
|
|
|
|
``_docs/02_document/contracts/c6_tile_cache/descriptor_index.md`` and
|
|
|
|
|
task spec ``AZ-306_c6_faiss_descriptor_index.md``.
|
|
|
|
|
|
|
|
|
|
Layout on disk (all three files live next to each other):
|
|
|
|
|
|
|
|
|
|
- ``<index_path>`` — FAISS HNSW32 + IndexIDMap2 binary blob.
|
|
|
|
|
- ``<index_path>.sha256`` — AZ-280 content-hash sidecar.
|
|
|
|
|
- ``<index_path>.meta.json`` — :class:`IndexMetadata` + ``tile_id``↔``int64`` map.
|
|
|
|
|
|
|
|
|
|
A construction-time triple-consistency check
|
|
|
|
|
(``sha256(.index) == .sha256.content == meta.sidecar_sha256_hex``) makes
|
|
|
|
|
any partial-write state fail closed via :class:`IndexUnavailableError`.
|
|
|
|
|
|
|
|
|
|
The composition root entry-point is :meth:`FaissDescriptorIndex.from_config`;
|
|
|
|
|
the AZ-303 factory ``runtime_root.storage_factory.build_descriptor_index``
|
|
|
|
|
gates the import on the ``BUILD_FAISS_INDEX`` env flag and only then
|
|
|
|
|
constructs the impl.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import hashlib
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
import time
|
|
|
|
|
from dataclasses import asdict
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
|
|
|
|
|
|
import faiss
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
from gps_denied_onboard.components.c6_tile_cache._types import (
|
|
|
|
|
HnswParams,
|
|
|
|
|
IndexMetadata,
|
|
|
|
|
TileId,
|
|
|
|
|
)
|
|
|
|
|
from gps_denied_onboard.components.c6_tile_cache.errors import (
|
|
|
|
|
IndexBuildError,
|
|
|
|
|
IndexUnavailableError,
|
|
|
|
|
)
|
|
|
|
|
from gps_denied_onboard.helpers.sha256_sidecar import (
|
|
|
|
|
SIDECAR_SUFFIX,
|
|
|
|
|
Sha256Sidecar,
|
|
|
|
|
Sha256SidecarError,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
from gps_denied_onboard.config.schema import Config
|
|
|
|
|
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = ["META_SUFFIX", "FaissDescriptorIndex", "tile_id_to_int64"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Suffix appended to the index path to locate the JSON metadata file.
META_SUFFIX = ".meta.json"
|
|
|
|
|
# Logger name requested from ``get_logger`` when an instance is built from config.
_PRODUCER_ID = "c6_tile_cache.faiss"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def tile_id_to_int64(tile_id: TileId) -> int:
    """Map a ``TileId`` to a stable signed 64-bit integer (AZ-306 convention).

    The hash key is the text ``"{zoom_level}|{lat:.8f}|{lon:.8f}"``. Its
    UTF-8 bytes are digested with SHA-256 and the leading eight digest
    bytes are interpreted as a big-endian signed integer.

    Only ``zoom_level``, ``lat`` and ``lon`` are read: ``source`` is
    intentionally NOT part of the key, because a tile is identified by
    its spatial position rather than by the feed that produced it. Two
    distinct tiles mapping to the same integer are detected at rebuild
    time and reported as :class:`IndexBuildError`.
    """
    key = "{}|{:.8f}|{:.8f}".format(tile_id.zoom_level, tile_id.lat, tile_id.lon)
    leading_bytes = hashlib.sha256(key.encode("utf-8")).digest()[:8]
    return int.from_bytes(leading_bytes, "big", signed=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FaissDescriptorIndex:
|
|
|
|
|
"""Production ``DescriptorIndex`` strategy backed by ``faiss-cpu``.
|
|
|
|
|
|
|
|
|
|
Construction loads the existing ``.index`` via
|
|
|
|
|
``faiss.read_index(IO_FLAG_MMAP | IO_FLAG_READ_ONLY)`` and runs an
|
|
|
|
|
optional warm-up query. Rebuild produces a fresh atomic write of
|
|
|
|
|
all three sidecar files in dependency order so a crash leaves the
|
|
|
|
|
prior index intact.
|
|
|
|
|
|
|
|
|
|
Not re-entrant — see ``descriptor_index.md`` § I-8.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
index_path: Path,
|
|
|
|
|
sidecar: Sha256Sidecar,
|
|
|
|
|
logger: logging.Logger,
|
|
|
|
|
warmup_query: np.ndarray | None = None,
|
|
|
|
|
) -> None:
|
|
|
|
|
self._index_path = Path(index_path)
|
|
|
|
|
self._sidecar = sidecar
|
|
|
|
|
self._logger = logger
|
|
|
|
|
self._index: Any | None = None
|
|
|
|
|
self._meta: IndexMetadata | None = None
|
|
|
|
|
self._id_to_tile_id: dict[int, TileId] = {}
|
|
|
|
|
self._descriptor_dim: int | None = None
|
|
|
|
|
|
|
|
|
|
self._load()
|
|
|
|
|
|
|
|
|
|
if warmup_query is not None:
|
|
|
|
|
self.search_topk(warmup_query, k=1)
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Composition-root entry point
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
def from_config(cls, config: Config) -> FaissDescriptorIndex:
|
|
|
|
|
"""Wire up an instance from the project ``Config`` (mirrors AZ-305)."""
|
|
|
|
|
from gps_denied_onboard.logging import get_logger
|
|
|
|
|
|
|
|
|
|
block = config.components["c6_tile_cache"]
|
|
|
|
|
index_path = Path(block.faiss_index_path) if block.faiss_index_path else (
|
|
|
|
|
Path(block.root_dir) / "descriptor.index"
|
|
|
|
|
)
|
|
|
|
|
warmup_query: np.ndarray | None = None
|
|
|
|
|
if block.faiss_warmup_query_path:
|
|
|
|
|
warmup_query = cls._load_warmup_query(Path(block.faiss_warmup_query_path))
|
|
|
|
|
return cls(
|
|
|
|
|
index_path=index_path,
|
|
|
|
|
sidecar=Sha256Sidecar(),
|
|
|
|
|
logger=get_logger(_PRODUCER_ID),
|
|
|
|
|
warmup_query=warmup_query,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _load_warmup_query(path: Path) -> np.ndarray:
|
|
|
|
|
try:
|
|
|
|
|
arr = np.load(path)
|
|
|
|
|
except (OSError, ValueError) as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.from_config: cannot load warmup query "
|
|
|
|
|
f"at {path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
return np.ascontiguousarray(arr, dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Protocol surface
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def search_topk(self, query: np.ndarray, k: int) -> list[tuple[TileId, float]]:
|
|
|
|
|
if self._index is None or self._descriptor_dim is None:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.search_topk: index at {self._index_path} "
|
|
|
|
|
"is not loaded"
|
|
|
|
|
)
|
|
|
|
|
self._validate_query(query, self._descriptor_dim)
|
|
|
|
|
try:
|
|
|
|
|
distances, ids = self._index.search(query.reshape(1, -1), k)
|
|
|
|
|
except RuntimeError as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.search_topk: FAISS search failed for "
|
|
|
|
|
f"index {self._index_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
results: list[tuple[TileId, float]] = []
|
|
|
|
|
for raw_id, distance in zip(ids[0].tolist(), distances[0].tolist(), strict=True):
|
|
|
|
|
if raw_id == -1:
|
|
|
|
|
continue
|
|
|
|
|
tile_id = self._id_to_tile_id.get(int(raw_id))
|
|
|
|
|
if tile_id is None:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.search_topk: int64 id {raw_id} "
|
|
|
|
|
f"returned by FAISS is not present in the meta.json mapping "
|
|
|
|
|
f"({self._index_path}{META_SUFFIX}); index is corrupt"
|
|
|
|
|
)
|
|
|
|
|
results.append((tile_id, float(distance)))
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
def descriptor_dim(self) -> int:
|
|
|
|
|
if self._descriptor_dim is None:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.descriptor_dim: index at {self._index_path} "
|
|
|
|
|
"is not loaded"
|
|
|
|
|
)
|
|
|
|
|
return self._descriptor_dim
|
|
|
|
|
|
|
|
|
|
def mmap_handle(self) -> Path:
|
|
|
|
|
if self._index is None:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.mmap_handle: index at {self._index_path} "
|
|
|
|
|
"is not loaded"
|
|
|
|
|
)
|
|
|
|
|
return self._index_path
|
|
|
|
|
|
|
|
|
|
def index_metadata(self) -> IndexMetadata:
|
|
|
|
|
if self._meta is None:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.index_metadata: index at {self._index_path} "
|
|
|
|
|
"is not loaded"
|
|
|
|
|
)
|
|
|
|
|
return self._meta
|
|
|
|
|
|
|
|
|
|
def rebuild_from_descriptors(
|
|
|
|
|
self,
|
|
|
|
|
descriptors: np.ndarray,
|
|
|
|
|
tile_ids: list[TileId],
|
|
|
|
|
hnsw_params: HnswParams,
|
|
|
|
|
) -> None:
|
|
|
|
|
self._validate_rebuild_input(descriptors, tile_ids)
|
|
|
|
|
descriptor_dim = int(descriptors.shape[1])
|
|
|
|
|
|
|
|
|
|
int64_ids = self._derive_int64_ids(tile_ids)
|
|
|
|
|
|
|
|
|
|
index = self._build_hnsw(descriptors, int64_ids, descriptor_dim, hnsw_params)
|
|
|
|
|
|
|
|
|
|
# ``backbone_label`` / ``backbone_sha256_hex`` are inputs to the
|
|
|
|
|
# provisioning chain owned by C10/C7. AZ-306 stores them as
|
|
|
|
|
# opaque pass-through fields populated from the existing meta
|
|
|
|
|
# when available so a rebuild does not lose chain identity; the
|
|
|
|
|
# initial-build case uses sentinels that C10's Manifest builder
|
|
|
|
|
# later overwrites.
|
|
|
|
|
previous = self._meta
|
|
|
|
|
backbone_label = previous.backbone_label if previous is not None else "unknown"
|
|
|
|
|
backbone_sha256_hex = (
|
|
|
|
|
previous.backbone_sha256_hex if previous is not None else "0" * 64
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
start = time.perf_counter()
|
|
|
|
|
temp_path = self._index_path.with_suffix(self._index_path.suffix + ".tmp")
|
|
|
|
|
try:
|
|
|
|
|
faiss.write_index(index, str(temp_path))
|
|
|
|
|
except RuntimeError as exc:
|
|
|
|
|
self._safe_unlink(temp_path)
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild_from_descriptors: faiss.write_index "
|
|
|
|
|
f"failed for {temp_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
try:
|
|
|
|
|
temp_bytes = temp_path.read_bytes()
|
|
|
|
|
except OSError as exc:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild_from_descriptors: cannot "
|
|
|
|
|
f"re-read temp index at {temp_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
sidecar_sha256_hex = hashlib.sha256(temp_bytes).hexdigest()
|
|
|
|
|
|
|
|
|
|
built_at = datetime.now(timezone.utc)
|
|
|
|
|
meta = IndexMetadata(
|
|
|
|
|
descriptor_dim=descriptor_dim,
|
|
|
|
|
n_vectors=len(tile_ids),
|
|
|
|
|
backbone_label=backbone_label,
|
|
|
|
|
backbone_sha256_hex=backbone_sha256_hex,
|
|
|
|
|
built_at=built_at,
|
|
|
|
|
hnsw_params=hnsw_params,
|
|
|
|
|
sidecar_sha256_hex=sidecar_sha256_hex,
|
|
|
|
|
file_path=self._index_path,
|
|
|
|
|
)
|
|
|
|
|
meta_bytes = self._serialise_meta(meta, tile_ids, int64_ids)
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
self._sidecar.write_atomic_and_sidecar(self._index_path, temp_bytes)
|
|
|
|
|
except Sha256SidecarError as exc:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild_from_descriptors: atomic "
|
|
|
|
|
f"write of {self._index_path} failed: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
self._sidecar.write_atomic(self._meta_path, meta_bytes)
|
|
|
|
|
except Sha256SidecarError as exc:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild_from_descriptors: atomic "
|
|
|
|
|
f"write of {self._meta_path} failed: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
finally:
|
|
|
|
|
self._safe_unlink(temp_path)
|
|
|
|
|
|
|
|
|
|
self._load()
|
|
|
|
|
|
|
|
|
|
self._logger.info(
|
|
|
|
|
"c6.faiss.rebuilt",
|
|
|
|
|
extra={
|
|
|
|
|
"kind": "c6.faiss.rebuilt",
|
|
|
|
|
"n_vectors": len(tile_ids),
|
|
|
|
|
"descriptor_dim": descriptor_dim,
|
|
|
|
|
"elapsed_seconds": time.perf_counter() - start,
|
|
|
|
|
"index_path": str(self._index_path),
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Internals
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def _meta_path(self) -> Path:
|
|
|
|
|
return Path(str(self._index_path) + META_SUFFIX)
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def _hash_sidecar_path(self) -> Path:
|
|
|
|
|
return Path(str(self._index_path) + SIDECAR_SUFFIX)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _validate_query(query: np.ndarray, descriptor_dim: int) -> None:
|
|
|
|
|
if not isinstance(query, np.ndarray):
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.search_topk: query must be a numpy "
|
|
|
|
|
f"ndarray, got {type(query).__name__}"
|
|
|
|
|
)
|
|
|
|
|
if query.dtype != np.float32:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.search_topk: query.dtype must be "
|
|
|
|
|
f"float32, got {query.dtype}"
|
|
|
|
|
)
|
|
|
|
|
if query.shape != (descriptor_dim,):
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex.search_topk: query.shape must be "
|
|
|
|
|
f"({descriptor_dim},), got {query.shape}"
|
|
|
|
|
)
|
|
|
|
|
if not query.flags["C_CONTIGUOUS"]:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
"FaissDescriptorIndex.search_topk: query must be C-contiguous"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _validate_rebuild_input(
|
|
|
|
|
descriptors: np.ndarray, tile_ids: list[TileId]
|
|
|
|
|
) -> None:
|
|
|
|
|
if not isinstance(descriptors, np.ndarray):
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild: descriptors must be a numpy "
|
|
|
|
|
f"ndarray, got {type(descriptors).__name__}"
|
|
|
|
|
)
|
|
|
|
|
if descriptors.dtype != np.float32:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild: descriptors.dtype must be "
|
|
|
|
|
f"float32, got {descriptors.dtype}"
|
|
|
|
|
)
|
|
|
|
|
if descriptors.ndim != 2:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild: descriptors.ndim must be 2, "
|
|
|
|
|
f"got {descriptors.ndim}"
|
|
|
|
|
)
|
|
|
|
|
if not descriptors.flags["C_CONTIGUOUS"]:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
"FaissDescriptorIndex.rebuild: descriptors must be C-contiguous"
|
|
|
|
|
)
|
|
|
|
|
if not tile_ids:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
"FaissDescriptorIndex.rebuild: tile_ids must be non-empty"
|
|
|
|
|
)
|
|
|
|
|
if descriptors.shape[0] != len(tile_ids):
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild: descriptors.shape[0]="
|
|
|
|
|
f"{descriptors.shape[0]} != len(tile_ids)={len(tile_ids)}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _derive_int64_ids(tile_ids: list[TileId]) -> list[int]:
|
|
|
|
|
seen: dict[int, TileId] = {}
|
|
|
|
|
ids: list[int] = []
|
|
|
|
|
for tile_id in tile_ids:
|
|
|
|
|
mapped = tile_id_to_int64(tile_id)
|
|
|
|
|
existing = seen.get(mapped)
|
|
|
|
|
if existing is not None and existing != tile_id:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild: int64-id collision: "
|
|
|
|
|
f"{existing!r} and {tile_id!r} both hash to {mapped}"
|
|
|
|
|
)
|
|
|
|
|
seen[mapped] = tile_id
|
|
|
|
|
ids.append(mapped)
|
|
|
|
|
return ids
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _build_hnsw(
|
|
|
|
|
descriptors: np.ndarray,
|
|
|
|
|
int64_ids: list[int],
|
|
|
|
|
descriptor_dim: int,
|
|
|
|
|
hnsw_params: HnswParams,
|
|
|
|
|
) -> Any:
|
|
|
|
|
metric = (
|
|
|
|
|
faiss.METRIC_INNER_PRODUCT
|
|
|
|
|
if hnsw_params.metric.upper() == "INNER_PRODUCT"
|
|
|
|
|
else faiss.METRIC_L2
|
|
|
|
|
)
|
|
|
|
|
try:
|
|
|
|
|
inner = faiss.IndexHNSWFlat(descriptor_dim, hnsw_params.m, metric)
|
|
|
|
|
inner.hnsw.efConstruction = hnsw_params.ef_construction
|
|
|
|
|
inner.hnsw.efSearch = hnsw_params.ef_search
|
|
|
|
|
wrapped = faiss.IndexIDMap2(inner)
|
|
|
|
|
wrapped.add_with_ids(descriptors, np.asarray(int64_ids, dtype=np.int64))
|
|
|
|
|
except RuntimeError as exc:
|
|
|
|
|
raise IndexBuildError(
|
|
|
|
|
f"FaissDescriptorIndex.rebuild: FAISS HNSW build failed: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
return wrapped
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _serialise_meta(
|
|
|
|
|
meta: IndexMetadata,
|
|
|
|
|
tile_ids: list[TileId],
|
|
|
|
|
int64_ids: list[int],
|
|
|
|
|
) -> bytes:
|
|
|
|
|
payload = {
|
|
|
|
|
"descriptor_dim": meta.descriptor_dim,
|
|
|
|
|
"n_vectors": meta.n_vectors,
|
|
|
|
|
"backbone_label": meta.backbone_label,
|
|
|
|
|
"backbone_sha256_hex": meta.backbone_sha256_hex,
|
|
|
|
|
"built_at": meta.built_at.isoformat(),
|
|
|
|
|
"hnsw_params": asdict(meta.hnsw_params),
|
|
|
|
|
"sidecar_sha256_hex": meta.sidecar_sha256_hex,
|
|
|
|
|
"file_path": str(meta.file_path),
|
|
|
|
|
"id_mapping": [
|
|
|
|
|
{
|
|
|
|
|
"int64_id": int64_id,
|
|
|
|
|
"zoom_level": tile_id.zoom_level,
|
|
|
|
|
"lat": tile_id.lat,
|
|
|
|
|
"lon": tile_id.lon,
|
|
|
|
|
}
|
|
|
|
|
for int64_id, tile_id in zip(int64_ids, tile_ids, strict=True)
|
|
|
|
|
],
|
|
|
|
|
}
|
|
|
|
|
return json.dumps(payload, sort_keys=True, indent=2).encode("utf-8")
|
|
|
|
|
|
|
|
|
|
def _load(self) -> None:
|
|
|
|
|
index_path = self._index_path
|
|
|
|
|
if not index_path.is_file():
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: .index file missing at {index_path}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
sidecar_path = self._hash_sidecar_path
|
|
|
|
|
if not sidecar_path.is_file():
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: .sha256 sidecar missing at {sidecar_path}"
|
|
|
|
|
)
|
|
|
|
|
try:
|
|
|
|
|
sidecar_text = sidecar_path.read_text(encoding="ascii").strip()
|
|
|
|
|
except OSError as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: cannot read sidecar at {sidecar_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
disk_digest = self._stream_sha256(index_path)
|
|
|
|
|
except OSError as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: cannot read index at {index_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
if disk_digest != sidecar_text:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: sidecar mismatch at {index_path}: "
|
|
|
|
|
f"on-disk sha256={disk_digest} but sidecar={sidecar_text}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
meta_path = self._meta_path
|
|
|
|
|
if not meta_path.is_file():
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: .meta.json missing at {meta_path}"
|
|
|
|
|
)
|
|
|
|
|
try:
|
|
|
|
|
meta_raw = meta_path.read_bytes()
|
|
|
|
|
except OSError as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: cannot read meta.json at {meta_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
try:
|
|
|
|
|
meta_payload = json.loads(meta_raw.decode("utf-8"))
|
|
|
|
|
except (UnicodeDecodeError, json.JSONDecodeError) as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: meta.json at {meta_path} is malformed: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
meta, id_to_tile_id = self._parse_meta(meta_payload, meta_path)
|
|
|
|
|
|
|
|
|
|
if meta.sidecar_sha256_hex != disk_digest:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: meta.json sidecar_sha256_hex "
|
|
|
|
|
f"{meta.sidecar_sha256_hex} does not match on-disk sha256 "
|
|
|
|
|
f"{disk_digest} for {index_path}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
index = faiss.read_index(
|
|
|
|
|
str(index_path), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
|
|
|
|
|
)
|
|
|
|
|
except RuntimeError as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: faiss.read_index failed for {index_path}: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
self._apply_ef_search(index, meta.hnsw_params.ef_search)
|
|
|
|
|
|
|
|
|
|
self._index = index
|
|
|
|
|
self._meta = meta
|
|
|
|
|
self._id_to_tile_id = id_to_tile_id
|
|
|
|
|
self._descriptor_dim = meta.descriptor_dim
|
|
|
|
|
|
|
|
|
|
self._logger.info(
|
|
|
|
|
"c6.faiss.loaded",
|
|
|
|
|
extra={
|
|
|
|
|
"kind": "c6.faiss.loaded",
|
|
|
|
|
"n_vectors": meta.n_vectors,
|
|
|
|
|
"descriptor_dim": meta.descriptor_dim,
|
|
|
|
|
"sidecar_sha256_hex": meta.sidecar_sha256_hex,
|
|
|
|
|
"built_at": meta.built_at.isoformat(),
|
|
|
|
|
"index_path": str(index_path),
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _parse_meta(
|
|
|
|
|
payload: dict[str, Any], meta_path: Path
|
|
|
|
|
) -> tuple[IndexMetadata, dict[int, TileId]]:
|
|
|
|
|
try:
|
|
|
|
|
hnsw_block = payload["hnsw_params"]
|
|
|
|
|
id_mapping = payload["id_mapping"]
|
|
|
|
|
meta = IndexMetadata(
|
|
|
|
|
descriptor_dim=int(payload["descriptor_dim"]),
|
|
|
|
|
n_vectors=int(payload["n_vectors"]),
|
|
|
|
|
backbone_label=str(payload["backbone_label"]),
|
|
|
|
|
backbone_sha256_hex=str(payload["backbone_sha256_hex"]),
|
|
|
|
|
built_at=datetime.fromisoformat(str(payload["built_at"])),
|
|
|
|
|
hnsw_params=HnswParams(
|
|
|
|
|
m=int(hnsw_block["m"]),
|
|
|
|
|
ef_construction=int(hnsw_block["ef_construction"]),
|
|
|
|
|
ef_search=int(hnsw_block["ef_search"]),
|
|
|
|
|
metric=str(hnsw_block["metric"]),
|
|
|
|
|
),
|
|
|
|
|
sidecar_sha256_hex=str(payload["sidecar_sha256_hex"]),
|
|
|
|
|
file_path=Path(str(payload["file_path"])),
|
|
|
|
|
)
|
|
|
|
|
except (KeyError, TypeError, ValueError) as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: meta.json at {meta_path} has invalid "
|
|
|
|
|
f"shape: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
id_to_tile_id: dict[int, TileId] = {}
|
|
|
|
|
try:
|
|
|
|
|
for entry in id_mapping:
|
|
|
|
|
int64_id = int(entry["int64_id"])
|
|
|
|
|
tile_id = TileId(
|
|
|
|
|
zoom_level=int(entry["zoom_level"]),
|
|
|
|
|
lat=float(entry["lat"]),
|
|
|
|
|
lon=float(entry["lon"]),
|
|
|
|
|
)
|
|
|
|
|
id_to_tile_id[int64_id] = tile_id
|
|
|
|
|
except (KeyError, TypeError, ValueError) as exc:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: meta.json id_mapping at {meta_path} is "
|
|
|
|
|
f"invalid: {exc}"
|
|
|
|
|
) from exc
|
|
|
|
|
|
|
|
|
|
if len(id_to_tile_id) != meta.n_vectors:
|
|
|
|
|
raise IndexUnavailableError(
|
|
|
|
|
f"FaissDescriptorIndex: meta.json id_mapping length "
|
|
|
|
|
f"{len(id_to_tile_id)} does not match n_vectors "
|
|
|
|
|
f"{meta.n_vectors} at {meta_path}"
|
|
|
|
|
)
|
|
|
|
|
return meta, id_to_tile_id
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _apply_ef_search(index: Any, ef_search: int) -> None:
|
|
|
|
|
# IndexIDMap2 wraps the HNSW; reach through to set efSearch for queries.
|
|
|
|
|
try:
|
|
|
|
|
inner = faiss.downcast_index(index.index) if hasattr(index, "index") else index
|
|
|
|
|
if hasattr(inner, "hnsw"):
|
|
|
|
|
inner.hnsw.efSearch = ef_search
|
|
|
|
|
except RuntimeError:
|
|
|
|
|
# Best-effort: stale efSearch stays at FAISS default; not fatal.
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _stream_sha256(path: Path) -> str:
|
|
|
|
|
hasher = hashlib.sha256()
|
|
|
|
|
with path.open("rb") as fh:
|
|
|
|
|
while True:
|
|
|
|
|
chunk = fh.read(1024 * 1024)
|
|
|
|
|
if not chunk:
|
|
|
|
|
break
|
|
|
|
|
hasher.update(chunk)
|
|
|
|
|
return hasher.hexdigest()
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _safe_unlink(path: Path) -> None:
|
|
|
|
|
try:
|
|
|
|
|
os.unlink(path)
|
|
|
|
|
except FileNotFoundError:
|
|
|
|
|
pass
|
|
|
|
|
except OSError:
|
|
|
|
|
# Intentional: cleanup failure is non-fatal; the next rebuild
|
|
|
|
|
# writes a fresh temp path with a random suffix anyway. Log
|
|
|
|
|
# the path so operators can sweep manually if it accumulates.
|
|
|
|
|
pass
|