[AZ-423] [AZ-427] Add FT-P-19 + FT-N-05 blackbox tests

Implement the AC-8.6 (top-K=10 retrieval scale-ratio + scene-change PARTIAL) and AC-8.2 / AC-NEW-6 (stale aged-tile rejection) blackbox scenarios.

AZ-423 (FT-P-19, 3pt) helpers + scenario:
- retrieval_evaluator.py — top-K within-distance evaluator (60 stills vs 100 m budget), scene-change PARTIAL recorder (always emits PARTIAL on the 2 _gmaps.png pairs), FDR record projectors, CSV writers.
- tests/positive/test_ft_p_19_sat_reloc_scale.py (6 parametrised variants).

AZ-427 (FT-N-05, 2pt) helpers + scenario:
- aged_tile_rejection_evaluator.py — Signal A (stale rejection at load) + Signal B (per-frame downgrade) decision matrix, reuses ALLOWED_SOURCE_LABELS from estimate_schema.
- tests/negative/test_ft_n_05_stale_tile_rejection.py (12 parametrised variants: FC × VIO × {7mo/active-conflict, 13mo/rear}).

48 new unit tests cover every helper branch. Both scenarios skip when sitl_replay_ready is false and fail loudly when fixture records are missing.

Per-batch review: PASS_WITH_WARNINGS (2 Low — production-dependency surface, FDR-kind constant duplication).
Cumulative review 82-84: PASS (2 Low carry-over / hygiene candidate).

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 11:41:12 +00:00 · 2026-05-17 15:43:06 +03:00
parent a22028087f
commit f25cae4a82
13 changed files with 2005 additions and 3 deletions
@@ -0,0 +1,168 @@
+"""FT-N-05 — Stale-tile rejection on freshness violation (AZ-427 / AC-8.2, AC-NEW-6).
+
+Two sub-cases:
+
+* ``synth-age-7mo`` mounted, SUT configured for active-conflict sector.
+* ``synth-age-13mo`` mounted, SUT configured for rear sector.
+
+For each sub-case: push the 60 still images, then assert one of:
+
+1. **Signal A** — the FDR contains at least one ``tile-load-rejected``
+   record with ``reason == "stale"`` AND no outbound emission carried
+   ``source_label = satellite_anchored``.
+2. **Signal B** — the SUT loaded the cache but every emission falls in
+   ``{visual_propagated, dead_reckoned}``.
+
+Either signal is acceptable; the AC fails the moment any frame slips
+through with ``satellite_anchored`` from these aged tiles.
+
+What this file owns:
+
+* The AC-1 / AC-2 / AC-3 wiring for both sub-cases.
+* Sector-binding declaration via ``AGED_FIXTURE_SECTOR_BINDINGS``.
+
+What this file does NOT own:
+
+* The frame-source push → ``runner.helpers.frame_source_replay`` (stub).
+* The SITL message receipt → ``runner.helpers.sitl_observer`` (stub).
+* The fixture-builder support for mounting ``synth-age-*`` tiles and
+  switching sector configuration — surfaced as a production-dependency
+  finding in the batch report (gated on ``sitl_replay_ready`` +
+  ``E2E_FT_N_05_FIXTURE`` env var; the test skips cleanly when not set
+  and fails loudly when set but the records are missing).
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import accuracy_evaluator as ae
+from runner.helpers import aged_tile_rejection_evaluator as ate
+
+FT_N_05_FIXTURE_ENV = "E2E_FT_N_05_FIXTURE"
+
+GT_CSV = Path(__file__).resolve().parents[3] / "_docs" / "00_problem" / "input_data" / "coordinates.csv"
+
+
+@pytest.mark.traces_to("AC-8.2,AC-NEW-6,AC-1,AC-2,AC-3")
+@pytest.mark.parametrize(
+    "fixture, sector",
+    ate.AGED_FIXTURE_SECTOR_BINDINGS,
+    ids=[f"{f}_{s}" for f, s in ate.AGED_FIXTURE_SECTOR_BINDINGS],
+)
+def test_ft_n_05_stale_tile_rejection(
+    fixture: str,
+    sector: str,
+    fc_adapter: str,
+    vio_strategy: str,
+    evidence_dir,  # type: ignore[no-untyped-def]
+    run_id: str,
+    nfr_recorder,  # type: ignore[no-untyped-def]
+    sitl_replay_ready: bool,
+) -> None:
+    """Full FT-N-05 sub-case (AC-1 or AC-2 depending on ``fixture``).
+
+    The test skips unless ``sitl_replay_ready`` is true and the
+    ``E2E_FT_N_05_FIXTURE`` env var names this sub-case's ``fixture``.
+    Otherwise it replays every still image, collects the ``source_label``
+    of each outbound emission, reads the stale-rejection payloads from the
+    run's FDR directory and passes both to
+    ``ate.evaluate_aged_tile_rejection``.
+
+    A frame whose outbound emission times out is tolerated, but it is
+    counted, recorded as a metric and listed in the failure message so a
+    run in which the SUT emitted nothing is visible in the evidence.
+
+    Args:
+        fixture: First element of one ``ate.AGED_FIXTURE_SECTOR_BINDINGS`` pair.
+        sector: Second element of the same pair.
+        fc_adapter: ``"ardupilot"`` selects the ``sitl-ardupilot`` host, any
+            other value selects ``sitl-inav``; also passed as ``fc_kind``.
+        vio_strategy: Not read in the body; presumably requested only to fan
+            the test out across VIO strategies (confirm against conftest).
+        evidence_dir: FDR records are read from
+            ``<evidence_dir parent>/run-<run_id>/fdr``.
+        run_id: Run identifier used to build the FDR path.
+        nfr_recorder: Receives the per-sub-case metrics.
+        sitl_replay_ready: Gate; the test skips when false.
+    """
+    if not sitl_replay_ready:
+        pytest.skip(
+            "FT-N-05 requires `E2E_SITL_REPLAY_DIR` to point at a SITL replay "
+            "fixture mounting one of the synth-age tile sets (AZ-595 + AZ-427 "
+            "fixture builder). Pure-logic AC-8.2/AC-NEW-6 coverage lives in "
+            "e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py."
+        )
+    active_fixture = os.environ.get(FT_N_05_FIXTURE_ENV)
+    if active_fixture is None:
+        pytest.skip(
+            f"FT-N-05 needs `{FT_N_05_FIXTURE_ENV}` env var to declare which "
+            f"synth-age fixture the fixture builder mounted "
+            f"(one of {[f for f, _ in ate.AGED_FIXTURE_SECTOR_BINDINGS]}). "
+            "The fixture builder publishes this from its per-sub-case orchestration."
+        )
+    if active_fixture != fixture:
+        pytest.skip(
+            f"FT-N-05 sub-case [{fixture}/{sector}] does not match the mounted "
+            f"fixture (`{FT_N_05_FIXTURE_ENV}` = {active_fixture!r}). "
+            "Subsequent matrix sub-cases will run in their own pytest invocations."
+        )
+
+    # Imported after the skip gates; presumably so skipped runs do not need
+    # the replay helpers to be importable.
+    from runner.helpers import fdr_reader, sitl_observer
+    from runner.helpers.frame_source_replay import FrameSourceReplayer
+
+    image_paths = sorted(GT_CSV.parent.glob("AD??????.jpg"))
+    if len(image_paths) != ae.TOTAL_IMAGES_REQUIRED:
+        pytest.fail(
+            f"FT-N-05 expects {ae.TOTAL_IMAGES_REQUIRED} still images, "
+            f"found {len(image_paths)} under {GT_CSV.parent}"
+        )
+
+    sitl_host = "sitl-ardupilot" if fc_adapter == "ardupilot" else "sitl-inav"
+    observer = sitl_observer.get_observer(fc_kind=fc_adapter, host=sitl_host)
+    sink = _resolve_frame_sink()
+    replayer = FrameSourceReplayer(sink)
+
+    emissions: list[ate.SourceLabelEmission] = []
+    timed_out_frame_ids: list[str] = []
+    per_image_timeout_s = 5.0
+    for path in image_paths:
+        image_id = path.name
+        replayer.replay_image(path)
+        try:
+            msg = observer.wait_for_outbound(timeout_s=per_image_timeout_s)
+        except TimeoutError:
+            # A missing emission is tolerated, but remember the frame so the
+            # gap shows up in the metrics and in the failure message instead
+            # of vanishing silently.
+            timed_out_frame_ids.append(image_id)
+            continue
+        label = getattr(msg, "source_label", None)
+        if not isinstance(label, str):
+            pytest.fail(
+                f"FT-N-05: outbound emission for {image_id} carries no `source_label` "
+                f"(got {type(label).__name__}). The SUT must include `source_label` per "
+                "FT-P-03 / AC-1.4 even on downgraded frames."
+            )
+        emissions.append(ate.SourceLabelEmission(frame_id=image_id, label=label))
+
+    fdr_root = Path(evidence_dir).parent / f"run-{run_id}" / "fdr"
+    rejections = list(ate.iter_stale_rejection_payloads(fdr_reader.iter_records(fdr_root)))
+
+    sub_case_id = f"{fixture}_{sector}"
+    report = ate.evaluate_aged_tile_rejection(
+        sub_case_id=sub_case_id,
+        fixture=fixture,
+        sector=sector,
+        emissions=emissions,
+        rejections=rejections,
+    )
+
+    # NOTE(review): every metric is tagged ac_id="AC-1" for both sub-cases,
+    # while the docstring maps sub-cases to AC-1 or AC-2 — confirm intended.
+    metric_prefix = f"ft_n_05.{sub_case_id}"
+    nfr_recorder.record_metric(
+        f"{metric_prefix}.emissions_observed", float(report.emissions_observed), ac_id="AC-1"
+    )
+    nfr_recorder.record_metric(
+        f"{metric_prefix}.anchored_count", float(report.anchored_count), ac_id="AC-1"
+    )
+    nfr_recorder.record_metric(
+        f"{metric_prefix}.downgrade_count", float(report.downgrade_frame_count), ac_id="AC-1"
+    )
+    nfr_recorder.record_metric(
+        f"{metric_prefix}.stale_rejection_count",
+        float(len(report.stale_rejections)),
+        ac_id="AC-1",
+    )
+    nfr_recorder.record_metric(
+        f"{metric_prefix}.timeout_count",
+        float(len(timed_out_frame_ids)),
+        ac_id="AC-1",
+    )
+
+    assert not report.illegal_labels, (
+        "FT-N-05: outbound emissions include labels outside the FT-P-03 contract: "
+        f"{report.illegal_labels}. Fix the SUT before assessing the freshness gate."
+    )
+    assert report.passes, (
+        f"FT-N-05 [{sub_case_id}] failed: signal_a={report.signal_a_holds}, "
+        f"signal_b={report.signal_b_holds}; "
+        f"anchored_frames={list(report.anchored_frame_ids)[:5]}, "
+        f"emissions={report.emissions_observed}, "
+        f"downgrade_count={report.downgrade_frame_count}, "
+        f"stale_rejections={list(report.stale_rejections)[:5]}, "
+        f"timed_out_frames={len(timed_out_frame_ids)} "
+        f"(first: {timed_out_frame_ids[:5]})"
+    )
+
+
+def _resolve_frame_sink():  # type: ignore[no-untyped-def]
+    """Build the replay-mode `FrameSink` handed to the replayer (counter-only; AZ-597).
+
+    Returns a freshly constructed ``NullFrameSink``. The import is done
+    inside the function, so ``runner.helpers.replay_mode`` is only loaded
+    when a sink is actually requested.
+    """
+    from runner.helpers.replay_mode import NullFrameSink as sink_factory
+
+    frame_sink = sink_factory()
+    return frame_sink
@@ -0,0 +1,149 @@
+"""FT-P-19 — Satellite-relocalization scale-ratio + scene-change PARTIAL (AZ-423 / AC-8.6).
+
+The full scenario:
+
+1. Push each ``AD0000NN.jpg`` from ``still-image-set-60`` to the SUT's
+   frame source, one at a time. For each image, the SUT runs top-K=10
+   tile-cache retrieval and writes an FDR ``retrieval-topk`` record
+   carrying the 10 candidate tile centres.
+2. The test joins the per-image GT (``coordinates.csv``) with the FDR
+   stream and asserts that EVERY image's top-K covers a tile centre
+   within 100 m of the image's true centre (``AC-1`` / set_contains).
+3. For the 2 paired ``_gmaps.png`` reference images the SUT runs the
+   cross-domain matcher; the test records per-image match success
+   into a CSV and tags the subset as ``PARTIAL`` (``AC-2``).
+4. ``AC-3`` — parameterisation across ``(fc_adapter, vio_strategy)``.
+
+What this file owns:
+
+* The AC-1 / AC-2 / AC-3 wiring above.
+* CSV evidence emission via the AZ-423-owned ``retrieval_evaluator``.
+
+What this file does NOT own:
+
+* The frame-source push → ``runner.helpers.frame_source_replay`` (stub,
+  AZ-441) — skip-gated.
+* The SITL message receipt → ``runner.helpers.sitl_observer`` (stub;
+  AZ-416/AZ-417) — skip-gated.
+* The SUT-side ``retrieval-topk`` + ``scene-change-match`` FDR record
+  emission — production gap, surfaced via fail-loud when the fixture
+  exists but the records are missing.
+
+When ``E2E_SITL_REPLAY_DIR`` is set and points at a prepared SITL
+replay fixture, this file's runtime path activates automatically.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import accuracy_evaluator as ae
+from runner.helpers import retrieval_evaluator as re_
+
+GT_CSV = Path(__file__).resolve().parents[3] / "_docs" / "00_problem" / "input_data" / "coordinates.csv"
+STILL_IMAGES_DIR = GT_CSV.parent
+
+
+@pytest.mark.traces_to("AC-8.6,AC-1,AC-2,AC-3")
+def test_ft_p_19_sat_reloc_scale(
+    fc_adapter: str,
+    vio_strategy: str,
+    evidence_dir,  # type: ignore[no-untyped-def]
+    run_id: str,
+    nfr_recorder,  # type: ignore[no-untyped-def]
+    sitl_replay_ready: bool,
+) -> None:
+    """End-to-end FT-P-19 check (AC-8.6) driven by the run's FDR records.
+
+    Steps:
+
+    * Skip unless ``sitl_replay_ready`` is true.
+    * Join every top-K payload carrying a string ``image_id`` with its
+      ground-truth row from ``coordinates.csv`` and project it to a query;
+      payloads with no GT row or no projection are dropped.
+    * Project every scene-change payload, dropping ``None`` results.
+    * Fail outright when no top-K query survived the join.
+    * Evaluate both reports, write one CSV each into ``evidence_dir``
+      (file names carry ``fc_adapter`` and ``vio_strategy``), record three
+      metrics, then assert AC-1 (top-K report passes) and AC-2 (subset
+      coverage complete and overall label equals the PARTIAL label).
+
+    AC-3 is the parametrisation across ``(fc_adapter, vio_strategy)``.
+    """
+    if not sitl_replay_ready:
+        pytest.skip(
+            "FT-P-19 requires `E2E_SITL_REPLAY_DIR` to point at a SITL replay "
+            "fixture emitting `retrieval-topk` + `scene-change-match` FDR "
+            "records (AZ-595 + AZ-423 fixture builder). Pure-logic AC-8.6 "
+            "coverage lives in e2e/_unit_tests/helpers/test_retrieval_evaluator.py."
+        )
+
+    from runner.helpers import fdr_reader
+
+    # Ground truth keyed by the extension-less image id.
+    ground_truth = {
+        _normalise_image_id(row.image_id): row for row in ae.load_gt_coordinates(GT_CSV)
+    }
+
+    evidence_root = Path(evidence_dir)
+    fdr_root = evidence_root.parent / f"run-{run_id}" / "fdr"
+
+    topk_queries: list[re_.TopKQuery] = []
+    for record in re_.iter_topk_payloads(fdr_reader.iter_records(fdr_root)):
+        # Non-dict payloads and non-string ids both fall through to the skip.
+        raw_id = record.get("image_id") if isinstance(record, dict) else None
+        if not isinstance(raw_id, str):
+            continue
+        truth = ground_truth.get(_normalise_image_id(raw_id))
+        if truth is None:
+            continue
+        query = re_.project_topk_record_to_query(
+            record, true_centre_lat_deg=truth.lat_deg, true_centre_lon_deg=truth.lon_deg
+        )
+        if query is not None:
+            topk_queries.append(query)
+
+    # Second pass over the FDR stream, this time for scene-change records.
+    projected_matches = (
+        re_.project_scene_change_record(record)
+        for record in re_.iter_scene_change_payloads(fdr_reader.iter_records(fdr_root))
+    )
+    scene_change_matches: list[re_.SceneChangeMatch] = [
+        match for match in projected_matches if match is not None
+    ]
+
+    if not topk_queries:
+        pytest.fail(
+            f"FT-P-19: no `{re_.RETRIEVAL_TOPK_FDR_KIND}` FDR records found at "
+            f"{fdr_root}. SUT must emit a top-K retrieval record per pushed "
+            "image; fixture builder must collect them."
+        )
+
+    topk_report = re_.evaluate_top_k_within_distance(
+        topk_queries, expected_image_count=ae.TOTAL_IMAGES_REQUIRED
+    )
+    scene_change_report = re_.evaluate_scene_change_subset(scene_change_matches)
+
+    variant_suffix = f"{fc_adapter}-{vio_strategy}"
+    re_.write_top_k_csv(evidence_root / f"ft-p-19-topk-{variant_suffix}.csv", topk_report)
+    re_.write_scene_change_csv(
+        evidence_root / f"ft-p-19-scene-change-{variant_suffix}.csv", scene_change_report
+    )
+
+    for metric_name, metric_value, ac_id in (
+        ("ft_p_19.topk_pass_count", topk_report.pass_count, "AC-1"),
+        ("ft_p_19.topk_image_count", len(topk_report.entries), "AC-1"),
+        ("ft_p_19.scene_change_matched_count", scene_change_report.matched_count, "AC-2"),
+    ):
+        nfr_recorder.record_metric(metric_name, float(metric_value), ac_id=ac_id)
+
+    assert topk_report.passes, (
+        f"AC-1 (top-K=10 within {topk_report.max_distance_m} m for {topk_report.expected_image_count} images) failed: "
+        f"pass_count={topk_report.pass_count}/{topk_report.expected_image_count}, "
+        f"failing={[(e.image_id, e.min_distance_m) for e in topk_report.failing_entries[:5]]}"
+    )
+    assert scene_change_report.coverage_complete, (
+        "AC-2 (scene-change subset coverage) failed: paired images observed = "
+        f"{[e.image_id for e in scene_change_report.entries]}, expected = "
+        f"{list(scene_change_report.expected_image_ids)}"
+    )
+    partial_label = re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL
+    assert scene_change_report.overall_label == partial_label, (
+        f"AC-2 (PARTIAL annotation): got {scene_change_report.overall_label!r}, "
+        f"expected {re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL!r}"
+    )
+
+
+def _normalise_image_id(image_id: str) -> str:
+    """Return ``image_id`` without a trailing ``.jpg`` (matched case-insensitively).
+
+    ``coordinates.csv`` uses the bare stem, so ids are reduced to that form
+    before lookup. Ids with any other ending are returned unchanged.
+    """
+    jpg_suffix = ".jpg"
+    if not image_id.lower().endswith(jpg_suffix):
+        return image_id
+    return image_id[: -len(jpg_suffix)]