"""Tests for the AZ-407 tile-cache-builder. Covers AC-1 (deterministic), AC-2 (footprint coverage), AC-7 (provenance docs present). FAISS portion gated via importorskip — the production Docker image installs faiss-cpu, but the local venv runs the test fine without it (asserting only manifest + tile-filesystem determinism). """ from __future__ import annotations import csv import hashlib import json import os import subprocess import sys from pathlib import Path import pytest REPO_ROOT = Path(__file__).resolve().parents[3] INPUT_DIR = REPO_ROOT / "_docs" / "00_problem" / "input_data" BUILDER_DIR = REPO_ROOT / "e2e" / "fixtures" / "tile-cache-builder" BUILDER_PY = BUILDER_DIR / "builder.py" def _run_builder(output_dir: Path) -> dict: """Invoke builder.py against the project input_data, return summary.""" env = dict(os.environ) env["PYTHONHASHSEED"] = "0" result = subprocess.run( [ sys.executable, str(BUILDER_PY), "--input-dir", str(INPUT_DIR), "--output-dir", str(output_dir), "--quiet", ], check=True, capture_output=True, text=True, env=env, ) return json.loads(result.stdout) def _walk_file_hashes(root: Path) -> dict[str, str]: """Return {relative_path: sha256_hex} for every file under root.""" hashes: dict[str, str] = {} for path in sorted(root.rglob("*")): if not path.is_file(): continue rel = path.relative_to(root).as_posix() hashes[rel] = hashlib.sha256(path.read_bytes()).hexdigest() return hashes def test_builder_is_deterministic(tmp_path: Path) -> None: """AC-1: two consecutive runs produce a bit-identical output tree.""" # Arrange out_a = tmp_path / "run-a" out_b = tmp_path / "run-b" # Act summary_a = _run_builder(out_a) summary_b = _run_builder(out_b) # Assert assert summary_a["manifest_hash"] == summary_b["manifest_hash"], ( f"manifest hash drift: {summary_a['manifest_hash']} vs " f"{summary_b['manifest_hash']} — AC-1 broken" ) if summary_a["descriptors_index_hash"] is not None: assert summary_a["descriptors_index_hash"] == summary_b["descriptors_index_hash"], ( "FAISS descriptors.index drift between runs — AC-1 broken" ) hashes_a = _walk_file_hashes(out_a) hashes_b = _walk_file_hashes(out_b) assert hashes_a == hashes_b, ( "Tile filesystem byte-drift between runs — AC-1 broken. " f"diff(a-b)={set(hashes_a) - set(hashes_b)}, " f"diff(b-a)={set(hashes_b) - set(hashes_a)}" ) def test_manifest_covers_60_stills_plus_bbox(tmp_path: Path) -> None: """AC-2: manifest contains 60 still entries + 1 Derkachi bbox entry.""" # Arrange out = tmp_path / "run" # Act summary = _run_builder(out) # Assert assert summary["tile_count"] == 61, ( f"expected 60 stills + 1 bbox = 61 rows, got {summary['tile_count']}" ) manifest_path = out / "manifest.csv" assert manifest_path.exists() with manifest_path.open() as fp: rows = list(csv.DictReader(fp)) assert len(rows) == 61 bbox_rows = [r for r in rows if r["provenance"].startswith("STUB_BBOX:derkachi")] assert len(bbox_rows) == 1, "exactly one Derkachi bbox row required" for r in rows: assert float(r["m_per_px"]) >= 0.5, ( f"row {r['tile_x']},{r['tile_y']} below 0.5 m/px AC-8.1 floor" ) def test_manifest_schema_matches_restrictions_md(tmp_path: Path) -> None: """AC-2 / data_model.md alignment: column order is the contract.""" # Arrange out = tmp_path / "run" _run_builder(out) # Act with (out / "manifest.csv").open() as fp: reader = csv.reader(fp) header = next(reader) # Assert assert header == [ "zoom_level", "tile_x", "tile_y", "capture_date", "source", "m_per_px", "jpeg_path", "content_hash", "provenance", ] def test_real_tile_count_matches_paired_gmaps(tmp_path: Path) -> None: """AC-2: every `_gmaps.png` reference becomes a `source=googlemaps` row.""" # Arrange out = tmp_path / "run" # Act summary = _run_builder(out) # Assert paired_count = len(list(INPUT_DIR.glob("AD*_gmaps.png"))) assert summary["real_count"] == paired_count, ( f"paired _gmaps.png files: {paired_count}, real rows: {summary['real_count']}" ) assert summary["paired_gmaps_count"] == paired_count def test_sidecar_json_per_tile(tmp_path: Path) -> None: """data_model.md § 2.1.2: every tile JPEG has a matching JSON sidecar.""" # Arrange out = tmp_path / "run" _run_builder(out) # Act jpgs = sorted((out / "tiles").rglob("*.jpg")) jsons = sorted((out / "tiles").rglob("*.json")) # Assert assert len(jpgs) == len(jsons) > 0 for jpg, sidecar in zip(jpgs, jsons, strict=True): assert jpg.with_suffix(".json") == sidecar data = json.loads(sidecar.read_text()) assert {"zoom_level", "tile_x", "tile_y", "capture_date", "source"} <= set(data) @pytest.mark.skipif( not BUILDER_DIR.joinpath("README.md").exists(), reason="builder README is the AC-7 provenance doc", ) def test_provenance_readme_lists_required_sections() -> None: """AC-7: README documents source URL/synthetic, license, redistribution.""" # Arrange readme = (BUILDER_DIR / "README.md").read_text() # Assert for required in ("Provenance", "License", "Reproducibility", "License-Expression: MIT".split(":")[0]): # accept "Provenance" as a section header OR "License" header if required == "Provenance": assert "## Provenance" in readme or "## Provenance (AC-7)" in readme elif required == "License": assert "License" in readme or "license" in readme elif required == "Reproducibility": assert "Reproducibility" in readme def test_faiss_index_emitted_when_faiss_available(tmp_path: Path) -> None: """AC-1: descriptors.index is bit-stable across runs (FAISS gate).""" # Arrange pytest.importorskip("faiss", reason="faiss-cpu not in test venv") out = tmp_path / "run" # Act summary = _run_builder(out) # Assert assert summary["descriptors_index_hash"] is not None, ( "faiss-cpu IS importable but builder produced no descriptors.index" ) index_path = out / "descriptors.index" assert index_path.exists() assert index_path.stat().st_size > 0