"""Deterministic tile-cache fixture builder.

Reads source imagery + ground-truth from ``_docs/00_problem/input_data/``
(passed as ``--input-dir``) and emits a reproducible ``tile-cache-fixture``
tree at ``--output-dir``::

    {output-dir}/
        tiles/{zoom}/{x}/{y}.jpg    # tile JPEG bodies
        tiles/{zoom}/{x}/{y}.json   # per-tile sidecar (mirrors `tiles` row)
        manifest.csv                # sorted manifest with content hashes
        descriptors.index           # stub FAISS HNSW index (optional)

The builder is invokable directly
(``python -m runner.fixtures.tile_cache_builder.builder``) or inside the
per-builder Docker image (``Dockerfile`` in this directory).

Reproducibility primitives (AC-1):

* Source files are sorted lexicographically before processing.
* PIL JPEG encode uses ``quality=85, optimize=False, progressive=False``
  with explicit ``subsampling=2`` (4:2:0). Every encoder knob is pinned
  explicitly so the output does not depend on PIL's defaults.
* Text artefacts (sidecars, manifest) are written as UTF-8 with ``\\n``
  line endings regardless of platform or locale.
* Manifest rows are sorted by ``(zoom_level, tile_x, tile_y)`` before
  CSV serialization.
* FAISS index (when ``faiss-cpu`` is importable) is built single-threaded
  with ``faiss.omp_set_num_threads(1)`` from a deterministically seeded
  descriptor sequence, so ``faiss.write_index`` output is expected to be
  stable given the same inputs.
* Descriptors are SHA-256-derived stub vectors — sufficient for schema
  contracts, NOT a substitute for real VPR descriptors emitted by C2.

Public-boundary discipline: this module does NOT import any
``src/gps_denied_onboard`` symbol. The on-disk schema lives in
``_docs/00_problem/restrictions.md`` § Satellite Imagery and is the only
contract this builder honours.
"""

from __future__ import annotations

import argparse
import csv
import datetime as _dt
import hashlib
import io
import json
import logging
import os
import shutil
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

logger = logging.getLogger(__name__)

# AC-2: Derkachi route bbox (placeholder centre — refined when D-PROJ-3
# lands the production Derkachi sector polygon). Lat/Lon are the bbox
# corners; the builder currently emits a single representative stub tile
# for the rectangle (see `build`).
DERKACHI_BBOX = {
    "min_lat": 50.05,
    "max_lat": 50.10,
    "min_lon": 36.10,
    "max_lon": 36.20,
}

# Static "frozen" capture date for the base fixture. AC-3's age-injector
# operates on a clone; the BASE fixture's date is intentionally fixed in
# the past so the C6 freshness check (6-mo active-conflict /
# 12-mo rear) treats it as fresh for the default scenarios.
BASE_CAPTURE_DATE = "2025-11-01"

# Zoom level used by C6 for the Derkachi corpus (matches restrictions.md
# §Satellite Imagery: ≥0.5 m/px at the cache interface).
DEFAULT_ZOOM = 18

# Tile dimensions (slippy/XYZ convention).
TILE_W = 256
TILE_H = 256

# Stub-descriptor dimensionality (matches the production VPR descriptor
# size declared in `_docs/02_document/components/c2_vpr/description.md`
# for layout compatibility; the values themselves are SHA-derived stubs).
DESCRIPTOR_DIM = 256

# Layout index historically reserved for the Derkachi bbox tile. It is
# only a lower bound: see `_bbox_index`.
_BBOX_BASE_INDEX = 60

# Single source of truth for the pinned JPEG encoder settings (AC-1).
_JPEG_SAVE_KWARGS = {
    "format": "JPEG",
    "quality": 85,
    "optimize": False,
    "progressive": False,
    "subsampling": 2,
}


@dataclass(frozen=True)
class TileEntry:
    """One row of the manifest. Sorted before CSV serialisation."""

    zoom_level: int
    tile_x: int
    tile_y: int
    capture_date: str
    source: str
    m_per_px: float
    jpeg_path: str
    content_hash: str
    provenance: str


def _tile_sort_key(entry: TileEntry) -> tuple[int, int, int]:
    """Canonical `(zoom_level, tile_x, tile_y)` ordering key for tiles."""
    return (entry.zoom_level, entry.tile_x, entry.tile_y)


def _iter_stills(input_dir: Path) -> Iterable[Path]:
    """Yield the ``AD*.jpg`` files directly under `input_dir`, sorted."""
    yield from sorted(input_dir.glob("AD*.jpg"))


def _iter_paired_gmaps(input_dir: Path) -> set[str]:
    """Return the set of still basenames that have a paired ``_gmaps.png``."""
    return {p.stem.removesuffix("_gmaps") for p in input_dir.glob("AD*_gmaps.png")}


def _slippy_xy_from_index(idx: int, zoom: int) -> tuple[int, int]:
    """Deterministic (tile_x, tile_y) layout: row-major raster.

    The mapping is NOT geodetically meaningful — it is a stable
    placeholder until D-PROJ-3 supplies the production tile-matrix
    transform. Each non-negative `idx` gets a unique (tx, ty) so the
    manifest stays collision-free as long as callers never reuse an index.
    """
    cols = 16  # 16 tiles per row; rows grow as needed with idx
    origin = 1 << (zoom - 1)  # offset both axes to the middle of the zoom grid
    tx = (idx % cols) + origin
    ty = (idx // cols) + origin
    return tx, ty


def _bbox_index(still_count: int) -> int:
    """Return the layout index for the Derkachi bbox tile.

    Stills occupy indices ``0 .. still_count - 1``. The bbox keeps its
    historical slot (60) while that slot is free and otherwise moves to
    the first index past the stills, so it can never overwrite a still's
    tile or duplicate a manifest key.
    """
    return max(_BBOX_BASE_INDEX, still_count)


def _encode_jpeg(img) -> bytes:
    """Encode a PIL image to JPEG bytes with the pinned encoder settings."""
    buf = io.BytesIO()
    img.save(buf, **_JPEG_SAVE_KWARGS)
    return buf.getvalue()


def _stub_jpeg_bytes(seed: int) -> bytes:
    """Render a deterministic 256x256 JPEG keyed on `seed`.

    No PIL randomness is involved. The body is a single solid RGB colour
    whose channels are derived from `seed` (each channel wraps modulo 256).
    """
    from PIL import Image  # noqa: PLC0415 — heavy import, deferred

    r = (seed * 37) & 0xFF
    g = (seed * 53) & 0xFF
    b = (seed * 71) & 0xFF
    img = Image.new("RGB", (TILE_W, TILE_H), color=(r, g, b))
    return _encode_jpeg(img)


def _real_tile_jpeg_bytes(gmaps_png: Path) -> bytes:
    """Re-encode a paired ``_gmaps.png`` as a deterministic tile JPEG.

    The PNG is converted to RGB and resized (bicubic) to the tile size.
    """
    from PIL import Image  # noqa: PLC0415

    # Context manager closes the underlying file handle; convert()/resize()
    # return independent in-memory images, so `img` outlives the `with`.
    with Image.open(gmaps_png) as src:
        img = src.convert("RGB").resize((TILE_W, TILE_H), Image.BICUBIC)
    return _encode_jpeg(img)


def _content_hash(b: bytes) -> str:
    """Return the hex SHA-256 digest of `b`."""
    return hashlib.sha256(b).hexdigest()


def _sidecar_dict(entry: TileEntry) -> dict:
    """Per-tile JSON sidecar (mirrors the `tiles` row content per
    data_model.md § 2.1.2).

    Every `TileEntry` field except `jpeg_path` is included.
    """
    return {
        "zoom_level": entry.zoom_level,
        "tile_x": entry.tile_x,
        "tile_y": entry.tile_y,
        "capture_date": entry.capture_date,
        "source": entry.source,
        "m_per_px": entry.m_per_px,
        "content_hash": entry.content_hash,
        "provenance": entry.provenance,
    }


def _emit_tile(out_dir: Path, entry: TileEntry, jpeg_bytes: bytes) -> None:
    """Write ``{out_dir}/tiles/{zoom}/{x}/{y}.jpg`` and its ``.json`` sidecar.

    The writes are plain (not atomic); `build` always starts from a
    freshly wiped output directory, so partial trees are never reused.
    """
    tile_dir = out_dir / "tiles" / str(entry.zoom_level) / str(entry.tile_x)
    tile_dir.mkdir(parents=True, exist_ok=True)
    jpg_path = tile_dir / f"{entry.tile_y}.jpg"
    json_path = tile_dir / f"{entry.tile_y}.json"
    jpg_path.write_bytes(jpeg_bytes)
    sidecar = json.dumps(_sidecar_dict(entry), sort_keys=True, separators=(",", ":")) + "\n"
    # Bytes, not text mode: avoids locale-dependent encoding and the
    # "\n" -> "\r\n" translation on Windows, keeping the tree bit-identical.
    json_path.write_bytes(sidecar.encode("utf-8"))


def _write_manifest(out_dir: Path, rows: list[TileEntry]) -> Path:
    """Write ``manifest.csv`` sorted by (zoom, x, y) and return its path."""
    manifest_path = out_dir / "manifest.csv"
    # Explicit UTF-8 + newline="" so the bytes do not depend on the host.
    with manifest_path.open("w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp, lineterminator="\n")
        writer.writerow(
            [
                "zoom_level",
                "tile_x",
                "tile_y",
                "capture_date",
                "source",
                "m_per_px",
                "jpeg_path",
                "content_hash",
                "provenance",
            ]
        )
        for r in sorted(rows, key=_tile_sort_key):
            writer.writerow(
                [
                    r.zoom_level,
                    r.tile_x,
                    r.tile_y,
                    r.capture_date,
                    r.source,
                    f"{r.m_per_px:.6f}",
                    r.jpeg_path,
                    r.content_hash,
                    r.provenance,
                ]
            )
    return manifest_path


def _write_descriptors_index(out_dir: Path, rows: list[TileEntry]) -> Path | None:
    """Emit a FAISS HNSW index of deterministic stub descriptors.

    Returns the index path on success, or None when faiss / numpy are not
    importable. The unit test gates on importorskip("faiss"); the
    production build inside ``Dockerfile`` ships faiss-cpu so this path
    is always exercised in CI.
    """
    try:
        import faiss  # noqa: PLC0415
        import numpy as np  # noqa: PLC0415
    except ImportError:
        logger.warning(
            "faiss / numpy not importable in this environment — "
            "skipping descriptors.index emission. The fixture is still "
            "usable for schema-only scenarios; VPR-matching scenarios "
            "need the Docker build."
        )
        return None

    # Single-threaded build so insertion order is fixed.
    faiss.omp_set_num_threads(1)

    descriptors = np.zeros((len(rows), DESCRIPTOR_DIM), dtype=np.float32)
    for i, r in enumerate(sorted(rows, key=_tile_sort_key)):
        # SHA-derived stub: hash the tile's content_hash + row position
        # into a seed, then draw DESCRIPTOR_DIM float32s from it. Stable
        # across runs because content_hash and the sort order are stable.
        seed_bytes = hashlib.sha256(
            f"{r.content_hash}|{i}".encode("ascii")
        ).digest()
        rng = np.random.default_rng(int.from_bytes(seed_bytes[:8], "big"))
        descriptors[i] = rng.standard_normal(DESCRIPTOR_DIM, dtype=np.float32)

    # HNSW32 + IP metric is the C2 production choice (see
    # _docs/02_document/components/c2_vpr/description.md).
    index = faiss.IndexHNSWFlat(DESCRIPTOR_DIM, 32, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efConstruction = 40
    index.hnsw.efSearch = 16
    index.add(descriptors)

    index_path = out_dir / "descriptors.index"
    faiss.write_index(index, str(index_path))
    return index_path


def _ensure_output_is_safe_to_wipe(input_dir: Path, output_dir: Path) -> None:
    """Refuse an `output_dir` whose removal would delete `input_dir`.

    Raises:
        ValueError: if `output_dir` is `input_dir` or one of its ancestors.
    """
    src = input_dir.resolve()
    dst = output_dir.resolve()
    if dst == src or dst in src.parents:
        raise ValueError(
            f"Refusing to wipe output directory {output_dir}: it is or "
            f"contains the input directory {input_dir}"
        )


def build(input_dir: Path, output_dir: Path) -> dict:
    """Build the tile-cache fixture under `output_dir` from `input_dir`.

    One tile is emitted per ``AD*.jpg`` still (a re-encoded paired
    ``_gmaps.png`` when present, otherwise a stub), plus one stub tile
    representing the Derkachi route bbox.

    Returns a manifest summary dict for caller logging:
        {"tile_count": int, "stub_count": int, "real_count": int,
         "paired_gmaps_count": int, "manifest_hash": str,
         "descriptors_index_hash": str | None}

    The output directory is wiped and re-created so two consecutive
    invocations against the same input produce bit-identical trees (AC-1).
    Inputs are validated BEFORE the wipe, so a bad invocation never
    destroys a previously built fixture.

    Raises:
        FileNotFoundError: if `input_dir` contains no ``AD*.jpg`` files.
        ValueError: if wiping `output_dir` would delete `input_dir`.
    """
    stills = list(_iter_stills(input_dir))
    if not stills:
        raise FileNotFoundError(
            f"No AD*.jpg files under {input_dir} — input_data/ may be missing"
        )
    _ensure_output_is_safe_to_wipe(input_dir, output_dir)
    paired = _iter_paired_gmaps(input_dir)

    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True)

    rows: list[TileEntry] = []
    stub_count = 0
    real_count = 0

    # AC-2: one tile entry per still + one entry for the Derkachi bbox.
    for idx, still in enumerate(stills):
        tx, ty = _slippy_xy_from_index(idx, DEFAULT_ZOOM)
        if still.stem in paired:
            jpeg = _real_tile_jpeg_bytes(input_dir / f"{still.stem}_gmaps.png")
            source = "googlemaps"
            provenance = f"paired_gmaps:{still.stem}"
            real_count += 1
        else:
            # D-PROJ-3 stub-tile fallback per AZ-407 spec lines 18–19.
            jpeg = _stub_jpeg_bytes(idx + 1)
            source = "stub"
            provenance = "STUB"
            stub_count += 1
        entry = TileEntry(
            zoom_level=DEFAULT_ZOOM,
            tile_x=tx,
            tile_y=ty,
            capture_date=BASE_CAPTURE_DATE,
            source=source,
            m_per_px=0.5,
            jpeg_path=f"tiles/{DEFAULT_ZOOM}/{tx}/{ty}.jpg",
            content_hash=_content_hash(jpeg),
            provenance=provenance,
        )
        rows.append(entry)
        _emit_tile(output_dir, entry, jpeg)

    # AC-2: Derkachi route bbox entry — single representative tile.
    # Real coverage of the bbox is owned by D-PROJ-3. The index is 60 for
    # corpora of up to 60 stills and moves past the stills otherwise, so
    # it never collides with a still's tile.
    bbox_idx = _bbox_index(len(stills))
    tx, ty = _slippy_xy_from_index(bbox_idx, DEFAULT_ZOOM)
    bbox_jpeg = _stub_jpeg_bytes(bbox_idx + 1)
    bbox_entry = TileEntry(
        zoom_level=DEFAULT_ZOOM,
        tile_x=tx,
        tile_y=ty,
        capture_date=BASE_CAPTURE_DATE,
        source="stub",
        m_per_px=0.5,
        jpeg_path=f"tiles/{DEFAULT_ZOOM}/{tx}/{ty}.jpg",
        content_hash=_content_hash(bbox_jpeg),
        provenance=(
            f"STUB_BBOX:derkachi:{DERKACHI_BBOX['min_lat']},"
            f"{DERKACHI_BBOX['min_lon']},{DERKACHI_BBOX['max_lat']},"
            f"{DERKACHI_BBOX['max_lon']}"
        ),
    )
    rows.append(bbox_entry)
    _emit_tile(output_dir, bbox_entry, bbox_jpeg)
    stub_count += 1

    manifest_path = _write_manifest(output_dir, rows)
    manifest_hash = hashlib.sha256(manifest_path.read_bytes()).hexdigest()

    index_path = _write_descriptors_index(output_dir, rows)
    if index_path is not None:
        descriptors_hash = hashlib.sha256(index_path.read_bytes()).hexdigest()
    else:
        descriptors_hash = None

    return {
        "tile_count": len(rows),
        "stub_count": stub_count,
        "real_count": real_count,
        "paired_gmaps_count": len(paired),
        "manifest_hash": manifest_hash,
        "descriptors_index_hash": descriptors_hash,
    }


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: build the fixture and print its summary as JSON.

    `argv` defaults to ``sys.argv[1:]`` when None. Returns the process
    exit code (0 on success); build errors propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description="Build the tile-cache test fixture")
    parser.add_argument(
        "--input-dir",
        type=Path,
        required=True,
        help="Directory containing AD*.jpg and AD*_gmaps.png source files",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        required=True,
        help="Output directory for the tile-cache fixture tree",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress per-tile log lines (errors still surface)",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.WARNING if args.quiet else logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    summary = build(args.input_dir, args.output_dir)
    json.dump(summary, sys.stdout, sort_keys=True, indent=2)
    sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())