From f25cae4a82f8614a284abe3f93c7a99f0dced55c Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Sun, 17 May 2026 15:43:06 +0300 Subject: [PATCH] [AZ-423] [AZ-427] Add FT-P-19 + FT-N-05 blackbox tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the AC-8.6 (top-K=10 retrieval scale-ratio + scene-change PARTIAL) and AC-8.2 / AC-NEW-6 (stale aged-tile rejection) blackbox scenarios. AZ-423 (FT-P-19, 3pt) helpers + scenario: - retrieval_evaluator.py — top-K within-distance evaluator (60 stills vs 100 m budget), scene-change PARTIAL recorder (always emits PARTIAL on the 2 _gmaps.png pairs), FDR record projectors, CSV writers. - tests/positive/test_ft_p_19_sat_reloc_scale.py (6 parametrised variants). AZ-427 (FT-N-05, 2pt) helpers + scenario: - aged_tile_rejection_evaluator.py — Signal A (stale rejection at load) + Signal B (per-frame downgrade) decision matrix, reuses ALLOWED_SOURCE_LABELS from estimate_schema. - tests/negative/test_ft_n_05_stale_tile_rejection.py (12 parametrised variants: FC × VIO × {7mo/active-conflict, 13mo/rear}). 48 new unit tests cover every helper branch. Both scenarios skip when sitl_replay_ready is false and fail loudly when fixture records are missing. Per-batch review: PASS_WITH_WARNINGS (2 Low — production-dependency surface, FDR-kind constant duplication). Cumulative review 82-84: PASS (2 Low carry-over / hygiene candidate). 
Co-authored-by: Cursor --- .../AZ-423_ft_p_19_sat_reloc_scale.md | 0 .../AZ-427_ft_n_05_stale_tile_rejection.md | 0 _docs/03_implementation/batch_84_report.md | 105 +++++ ...tive_review_batches_82-84_cycle1_report.md | 182 ++++++++ .../reviews/batch_84_review.md | 120 ++++++ _docs/_autodev_state.md | 6 +- .../test_aged_tile_rejection_evaluator.py | 287 +++++++++++++ .../helpers/test_retrieval_evaluator.py | 389 ++++++++++++++++++ e2e/_unit_tests/test_directory_layout.py | 4 + .../helpers/aged_tile_rejection_evaluator.py | 218 ++++++++++ e2e/runner/helpers/retrieval_evaluator.py | 380 +++++++++++++++++ .../test_ft_n_05_stale_tile_rejection.py | 168 ++++++++ .../positive/test_ft_p_19_sat_reloc_scale.py | 149 +++++++ 13 files changed, 2005 insertions(+), 3 deletions(-) rename _docs/02_tasks/{todo => done}/AZ-423_ft_p_19_sat_reloc_scale.md (100%) rename _docs/02_tasks/{todo => done}/AZ-427_ft_n_05_stale_tile_rejection.md (100%) create mode 100644 _docs/03_implementation/batch_84_report.md create mode 100644 _docs/03_implementation/cumulative_review_batches_82-84_cycle1_report.md create mode 100644 _docs/03_implementation/reviews/batch_84_review.md create mode 100644 e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py create mode 100644 e2e/_unit_tests/helpers/test_retrieval_evaluator.py create mode 100644 e2e/runner/helpers/aged_tile_rejection_evaluator.py create mode 100644 e2e/runner/helpers/retrieval_evaluator.py create mode 100644 e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py create mode 100644 e2e/tests/positive/test_ft_p_19_sat_reloc_scale.py diff --git a/_docs/02_tasks/todo/AZ-423_ft_p_19_sat_reloc_scale.md b/_docs/02_tasks/done/AZ-423_ft_p_19_sat_reloc_scale.md similarity index 100% rename from _docs/02_tasks/todo/AZ-423_ft_p_19_sat_reloc_scale.md rename to _docs/02_tasks/done/AZ-423_ft_p_19_sat_reloc_scale.md diff --git a/_docs/02_tasks/todo/AZ-427_ft_n_05_stale_tile_rejection.md b/_docs/02_tasks/done/AZ-427_ft_n_05_stale_tile_rejection.md 
similarity index 100% rename from _docs/02_tasks/todo/AZ-427_ft_n_05_stale_tile_rejection.md rename to _docs/02_tasks/done/AZ-427_ft_n_05_stale_tile_rejection.md diff --git a/_docs/03_implementation/batch_84_report.md b/_docs/03_implementation/batch_84_report.md new file mode 100644 index 0000000..f07ca75 --- /dev/null +++ b/_docs/03_implementation/batch_84_report.md @@ -0,0 +1,105 @@ +# Batch 84 — AZ-423 + AZ-427 (FT-P-19 sat-relocalization + FT-N-05 stale-tile rejection) + +**Tracker**: AZ-423, AZ-427 +**Tasks**: 2 tasks / 5 complexity points (3 + 2) +**Date**: 2026-05-17 +**Verdict**: PASS_WITH_WARNINGS +**Review**: `_docs/03_implementation/reviews/batch_84_review.md` + +## Scope + +- **AZ-423 / FT-P-19 (AC-8.6)**: per-image top-K=10 retrieval includes a tile centre within 100 m of the image's true centre across all 60 stills; scene-change subset (2 paired `_gmaps.png` images) carries PARTIAL annotation. +- **AZ-427 / FT-N-05 (AC-8.2, AC-NEW-6)**: aged-tile fixtures (`synth-age-7mo` in active-conflict sector, `synth-age-13mo` in rear sector) produce zero `satellite_anchored` emissions — either by load-time stale rejection (Signal A) or per-frame downgrade (Signal B). + +## Files + +### Created + +- `e2e/runner/helpers/retrieval_evaluator.py` — pure-logic evaluators for AC-1 (top-K within distance) + AC-2 (scene-change PARTIAL); FDR payload projectors for `retrieval-topk` + `scene-change-match` records; CSV emitters for evidence. +- `e2e/runner/helpers/aged_tile_rejection_evaluator.py` — pure-logic evaluator with explicit Signal-A / Signal-B decision matrix; FDR projector for `tile-load-rejected: stale` records; sector-binding constants (`synth-age-7mo → active_conflict`, `synth-age-13mo → rear`). +- `e2e/tests/positive/test_ft_p_19_sat_reloc_scale.py` — FT-P-19 scenario. +- `e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py` — FT-N-05 scenario (parametrised across 2 sub-cases via `AGED_FIXTURE_SECTOR_BINDINGS`). 
+- `e2e/_unit_tests/helpers/test_retrieval_evaluator.py` — 29 unit tests. +- `e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py` — 19 unit tests. + +### Modified + +- `e2e/_unit_tests/test_directory_layout.py` — registered 4 new paths. + +## Test Results + +``` +$ pytest _unit_tests/helpers/test_retrieval_evaluator.py \ + _unit_tests/helpers/test_aged_tile_rejection_evaluator.py \ + _unit_tests/test_directory_layout.py +============================= 157 passed in 0.28s ============================== +``` + +Scenario collection: + +``` +$ pytest tests/positive/test_ft_p_19_sat_reloc_scale.py \ + tests/negative/test_ft_n_05_stale_tile_rejection.py --collect-only +collected 18 items + test_ft_p_19_sat_reloc_scale: 6 cases (ardupilot/inav × {okvis2, klt_ransac, vins_mono}) + test_ft_n_05_stale_tile_rejection: 12 cases (FC × VIO × 2 aged-tile sub-cases) +``` + +## AC Verification + +### AZ-423 / FT-P-19 + +| AC | Coverage | +|----|----------| +| AC-1 top-K=10 within 100 m for all 60 images | `evaluate_top_k_within_distance` + scenario assertion + 9 unit tests | +| AC-2 scene-change subset PARTIAL | `evaluate_scene_change_subset` + scenario assertion + 5 unit tests | +| AC-3 parameterisation | 6 collected variants via conftest `fc_adapter` / `vio_strategy` fixtures | + +### AZ-427 / FT-N-05 + +| AC | Coverage | +|----|----------| +| AC-1 7mo aged tiles → 0 `satellite_anchored` emissions | parametrised sub-case + `evaluate_aged_tile_rejection` + 11 unit tests | +| AC-2 13mo aged tiles → 0 `satellite_anchored` emissions | same helper + sub-case binding `synth-age-13mo → rear` | +| AC-3 parameterisation | 12 collected variants (FC × VIO × 2 sub-cases) | + +`traces_to` markers: +- FT-P-19: `AC-8.6,AC-1,AC-2,AC-3` +- FT-N-05: `AC-8.2,AC-NEW-6,AC-1,AC-2,AC-3` + +## Code Review + +**Verdict**: PASS_WITH_WARNINGS — 0 Critical, 0 High, 2 Low. 
+ +- **F1 (production-dependency surface)**: both scenarios depend on upstream SUT features (`retrieval-topk` / `scene-change-match` FDR record kinds for AZ-423; `tile-load-rejected: stale` events OR per-frame downgrades for AZ-427) and fixture-builder support (`synth-age-*` mounts + `E2E_FT_N_05_FIXTURE` env var). Tests skip cleanly when fixtures are missing and fail loudly when fixtures exist but records are missing. +- **F2 (maintainability — cross-batch, intra-component duplication)**: `TILE_LOAD_REJECTED_FDR_KIND` and its stale-reason constant now exist in both `aged_tile_rejection_evaluator.py` and `mid_flight_tile_evaluator.py`. Future hygiene PBI candidate. + +Full review: `_docs/03_implementation/reviews/batch_84_review.md`. + +## Production Dependencies + +Surfaced for the cumulative review window (82-84) + traceability matrix: + +1. **AZ-423 SUT-side**: emit FDR `retrieval-topk` record kind per pushed image, carrying `image_id` + `candidates` list of `{tile_id, centre_lat_deg, centre_lon_deg}`. +2. **AZ-423 SUT-side**: emit FDR `scene-change-match` record kind for each paired `_gmaps.png` image, carrying `image_id` + `matched` (bool) + optional `inlier_count` (int). +3. **AZ-427 SUT-side**: wire the C6 freshness gate's sector-aware date comparison (active-conflict vs rear thresholds per AC-8.2) and emit FDR `tile-load-rejected: stale` events at startup OR ensure every emission downgrades to `{visual_propagated, dead_reckoned}`. +4. **Fixture-builder-side**: snapshot/mount of `synth-age-7mo` + `synth-age-13mo` tile sets, run-per-sub-case orchestration that publishes `E2E_FT_N_05_FIXTURE` env var declaring which fixture is currently active. +5. **Already exists**: `still-image-set-60` + `coordinates.csv` (AZ-407 deliverable) + `still-image-sat-refs-2` (AZ-407). +6. **Already exists**: `sitl_replay_ready` fixture, `fc_adapter` / `vio_strategy` parameterisation, `evidence_dir` + `nfr_recorder` + `run_id` (AZ-406). 
+ +## Architecture Compliance + +- All new files under `e2e/`, owned by the Blackbox Tests component per `_docs/02_document/module-layout.md`. +- No imports from `src/gps_denied_onboard` (explicit public-boundary discipline notes in both helpers). +- No new cyclic dependencies. Both helpers depend on `.geo` and (for `aged_tile_rejection_evaluator`) `.estimate_schema` only. +- No new infrastructure libraries. + +## Sub-step Trace + +Phases executed per `implement/SKILL.md`: +- phase 5 (load-spec) → AZ-423 + AZ-427 specs read +- phase 6 (implement-tasks-sequentially) → helpers + scenarios + unit tests +- phase 7 (verify-ac-coverage) → ACs traced above +- phase 8 (code-review) → batch_84_review.md (PASS_WITH_WARNINGS) +- phase 8.5 (cumulative-review) → batches_82-84 (next: K=3 cumulative trigger) +- phase 11 (commit-batch) → after cumulative review. diff --git a/_docs/03_implementation/cumulative_review_batches_82-84_cycle1_report.md b/_docs/03_implementation/cumulative_review_batches_82-84_cycle1_report.md new file mode 100644 index 0000000..392b950 --- /dev/null +++ b/_docs/03_implementation/cumulative_review_batches_82-84_cycle1_report.md @@ -0,0 +1,182 @@ +# Cumulative Code Review — Batches 82-84 (Cycle 1) + +**Window**: batches 82, 83, 84 +**Tasks covered**: AZ-421, AZ-422, AZ-423, AZ-427 +**Date**: 2026-05-17 +**Verdict**: PASS +**Last cumulative review**: batches 79-81 (`_docs/03_implementation/reviews/cumulative_79_81_review.md`) + +## Scope + +Union of files changed since the last cumulative review. Counts include the modifications to `e2e/_unit_tests/test_directory_layout.py` that each batch made. 
+ +### Files in window + +| Batch | Tasks | New helpers | New scenarios | New unit tests | +|-------|-------|-------------|---------------|----------------| +| 82 | AZ-421 (FT-P-15 + FT-P-16 + FT-P-18) | `tile_cache_inspector` | `test_ft_p_15_cache_schema`, `test_ft_p_16_offline_only`, `test_ft_p_18_no_raw_retention` | `test_tile_cache_inspector` | +| 83 | AZ-422 (FT-P-17 + FT-N-06) | `mid_flight_tile_evaluator`, `mock_suite_sat_audit` | `test_ft_p_17_mid_flight_tiles`, `test_ft_n_06_mid_flight_freshness` | `test_mid_flight_tile_evaluator`, `test_mock_suite_sat_audit` | +| 84 | AZ-423 + AZ-427 (FT-P-19 + FT-N-05) | `retrieval_evaluator`, `aged_tile_rejection_evaluator` | `test_ft_p_19_sat_reloc_scale`, `test_ft_n_05_stale_tile_rejection` | `test_retrieval_evaluator`, `test_aged_tile_rejection_evaluator` | + +Total: 5 new helper modules + 7 new scenario files + 5 new unit-test files across 4 tasks / 11 complexity points (3 + 3 + 3 + 2). + +## Cross-Cutting Themes + +### Theme 1 — FDR vocabulary fragmentation + +Each new helper defines its own FDR record-kind constants. As of batch 84: + +| Constant | Defined in | +|----------|------------| +| `MID_FLIGHT_TILE_FDR_KIND = "mid-flight-tile-output"` | `mid_flight_tile_evaluator.py` | +| `TILE_LOAD_REJECTED_FDR_KIND = "tile-load-rejected"` | `mid_flight_tile_evaluator.py` AND `aged_tile_rejection_evaluator.py` | +| `TILE_LOAD_REJECTED_STALE_REASON = "stale"` | same — duplicated | +| `RETRIEVAL_TOPK_FDR_KIND = "retrieval-topk"` | `retrieval_evaluator.py` | +| `SCENE_CHANGE_MATCH_FDR_KIND = "scene-change-match"` | `retrieval_evaluator.py` | + +The `tile-load-rejected` constant is the only true duplication; the rest live in one place. The duplication is intentional in spirit (each helper exposes the vocabulary it consumes) but creates a small coordination cost — surfaced as a Low finding in batch 84 review (F2). 
+ +Recommendation (NOT blocking this window): consolidate into a single `runner/helpers/fdr_record_kinds.py` constants module. Hygiene PBI candidate (~1 point). + +### Theme 2 — Geo math consolidation progress + +`runner.helpers.geo.distance_m` (Vincenty WGS84) is now consumed by: + +- `mid_flight_tile_evaluator.evaluate_dedup` (Batch 83, AC-3) +- `retrieval_evaluator.evaluate_top_k_within_distance` (Batch 84, AC-1) + +Both via `from .geo import distance_m`. This is the correct pattern. + +`runner.helpers.gcs_telemetry_evaluator.haversine_distance_m` (Batch 81) still exists as the lone outlier — surfaced in the prior cumulative review (79-81) as F1 / Low / Maintainability. Recommendation unchanged: refactor in a dedicated hygiene batch (≤1 point) before any further geo-math additions land. + +### Theme 3 — Tests-as-gates discipline + +Every scenario in this window follows the same skip/fail-loud pattern: + +* If `sitl_replay_ready` is false → `pytest.skip()`. +* If `sitl_replay_ready` is true BUT the required FDR records / env vars are missing → `pytest.fail()`. + +This ensures production-code or fixture gaps surface immediately when the SITL fixture is wired up, while CI runs without the fixture remain green. + +Aggregate production-dependency list (collected from batches 82-84 batch reports): + +* SUT-side FDR record kinds: `mid-flight-tile-output`, `tile-load-rejected` (with `reason="stale"`), `retrieval-topk`, `scene-change-match`, `cache-self-check`. +* SUT-side public-input mechanisms: `simulate_landing()` MAVLink command. +* SUT-side production logic: C6 freshness gate sector-aware date comparison, C2 top-K=10 retrieval result emission, cross-domain matcher per-pair output emission, mid-flight orthorectification + tile generation + Mode B Fact #105 schema population. 
+* Fixture-builder-side: `FT_P_17_HIGH_QUALITY_WINDOW_S` env var, `E2E_FT_N_05_FIXTURE` env var, `synth-age-7mo` / `synth-age-13mo` snapshots, Derkachi 5-min replay scenario, Docker network inspect snapshots (FT-P-16). + +All gated, all surfaced via skip messages with explicit fixture-owner references. This list feeds the AZ-595 (fixture-builder) backlog and the SUT-side production work. + +### Theme 4 — Source-label contract centralisation + +`aged_tile_rejection_evaluator` (Batch 84) reuses `ALLOWED_SOURCE_LABELS` from `estimate_schema` (AZ-411 deliverable). This is the correct pattern — the FT-P-03 / AC-1.4 source-label set is a one-source contract. No drift; new helpers should follow this lead when they need the same set. + +### Theme 5 — Helper-module size + complexity + +| Helper | Lines | Top-level public symbols | +|--------|-------|--------------------------| +| `tile_cache_inspector.py` | ~450 | 4 dataclasses + 4 evaluators + walk + JPEG probe | +| `mid_flight_tile_evaluator.py` | ~500 | 6 dataclasses + 6 evaluators + 2 projection helpers | +| `mock_suite_sat_audit.py` | ~77 | 1 function (`fetch_audit`) + constants | +| `retrieval_evaluator.py` | ~260 | 6 dataclasses + 2 evaluators + 2 projectors + 2 CSV writers + 2 iter filters | +| `aged_tile_rejection_evaluator.py` | ~190 | 3 dataclasses + 1 evaluator + 1 projector + 1 iter filter + constants | + +All evaluators stay under the 50-line function complexity threshold. Helper surfaces are coherent — no single helper does too many things. + +## Spec Compliance + +Spot-checked each batch's per-batch review verdict; no contradictions surfaced under cumulative inspection. 
+ +Cumulative AC coverage (E-BBT blackbox tests, this window): + +| AC ID | Task | Status | +|-------|------|--------| +| AC-8.4 | AZ-422 | Covered (FT-P-17 5 sub-evaluators) | +| AC-NEW-6 (mid-flight) | AZ-422 | Covered (FT-N-06 2 sub-evaluators) | +| AC-8.2 (cache schema + offline) | AZ-421 | Covered (FT-P-15, FT-P-16, FT-P-18) | +| AC-NEW-1 (no raw retention) | AZ-421 | Covered (FT-P-18) | +| AC-8.6 | AZ-423 | Covered (FT-P-19 scale + scene-change PARTIAL) | +| AC-NEW-6 (stale aged) | AZ-427 | Covered (FT-N-05 2 sub-cases) | +| AC-8.2 (sector freshness) | AZ-427 | Covered | + +No carried-over AC misses; no AC drift between batches. + +## Code Quality + +- **SOLID / SRP**: each helper has one reason to change. No God modules. The `mid_flight_tile_evaluator` has 6 evaluators — at the upper end but still cohesive (all consume the same `TileSpec` input from the same FDR record kind). +- **Error handling**: every input validator raises `ValueError`; every HTTP wrapper raises `RuntimeError`. No bare `except`. `_parse_iso8601_utc_seconds` catches only `(TypeError, ValueError)`. +- **Test pyramid**: unit tests dominate (52 in batch 83, 48 in batch 84, ~30 in batch 82). Each scenario has a corresponding pure-logic unit-test file. Scenario tests delegate to evaluators for all AC math, asserting only `report.passes` + diagnostic context. +- **Naming**: `evaluate_*` for evaluators, `project_*` for FDR-payload projectors, `iter_*` for record-kind filters, `*Report` for report dataclasses. Consistent across all 5 new helpers. +- **Comments**: docstrings cite AC IDs and Mode B Fact / restriction references. Inline comments are minimal and only where non-obvious. Arrange/Act/Assert pattern present in every unit test. +- **Dead code**: none. Re-verified `_top_level_field_to_attr` (Batch 83) is a documented forward-compatibility seam. + +## Security Quick-Scan + +- No SQL, no `subprocess` invocations, no `exec`/`eval`. +- No hardcoded secrets, no logging of sensitive data. 
+- HTTP client (`mock_suite_sat_audit`) validates URL non-empty, run_id non-empty, status 2xx, JSON body is dict, `entries` is list. Error messages truncate body to 200 chars. +- `aged_tile_rejection_evaluator` separates `illegal_labels` (contract violation) from `anchored_count` (freshness-gate violation) so one defect can't mask the other. + +## Performance Scan + +- `evaluate_dedup` is O(N²) — documented + bounded by < 100 tiles per 5-min replay. +- `evaluate_top_k_within_distance` is O(N·K) = 60 × 10 = 600 Vincenty calls per scenario. +- `evaluate_aged_tile_rejection` is O(M) where M ≤ 60. +- FDR iteration is generator-based via `fdr_reader.iter_records` (unchanged from prior windows). +- No N+1, no unbounded fetches, no async-in-sync, no hot-path allocations. + +## Cross-Task Consistency + +- All four tasks consume the same conftest fixtures (`sitl_replay_ready`, `fc_adapter`, `vio_strategy`, `evidence_dir`, `nfr_recorder`, `run_id`, plus task-specific helpers). +- `traces_to` marker convention consistent. +- Skip messages have a uniform shape: `"<scenario> requires <dependency>; <fixture-owner reference>"`. +- CSV evidence file naming: `ft-<p|n>-<NN>-<artefact>-<variant>.csv`. +- TileSpec / TilePublishRequest mirror is single-source in `mid_flight_tile_evaluator`. + +## Architecture Compliance (Phase 7) + +Verified against `_docs/02_document/module-layout.md`: + +- **Layer direction**: all new files under `e2e/` (Blackbox Tests). No imports from `src/gps_denied_onboard`. Public-boundary discipline notes present in every new helper. +- **Public API respect**: cross-component imports limited to `runner.helpers.{geo, estimate_schema, fdr_reader, accuracy_evaluator}` — all helpers that are intra-component to the Blackbox Tests package. +- **No new cyclic dependencies**: import graph for new files is a tree (helpers → geo / estimate_schema; tests → helpers). +- **Duplicate symbols**: `TILE_LOAD_REJECTED_FDR_KIND` appears twice (intra-component duplicate, surfaced in F2 below). No other duplicates. 
+- **Cross-cutting concerns**: no concern that should live in `shared/*` has been re-implemented locally. Geo, schema, CSV emission, HTTP all delegate. + +## Baseline Delta + +`_docs/02_document/architecture_compliance_baseline.md` does not exist for the Blackbox Tests workspace (this workspace was greenfield-bootstrapped); no Baseline Delta section to emit. + +## Findings + +| # | Severity | Category | Status | Title | +|---|----------|----------|--------|-------| +| F1 | Low | Maintainability | Carried over from 79-81 | `gcs_telemetry_evaluator.haversine_distance_m` duplicates `geo.distance_m` | +| F2 | Low | Maintainability | New in 82-84 (B84 review F2) | `TILE_LOAD_REJECTED_FDR_KIND` + stale-reason constant duplicated across two helpers | + +### F1 (carry-over) + +- **Location**: `e2e/runner/helpers/gcs_telemetry_evaluator.py` +- **State**: still present. No batch in this window touched it. +- **Recommendation unchanged**: consolidate to `geo.distance_m` in a dedicated ≤1-pt hygiene batch. + +### F2 (newly introduced) + +- **Location**: `e2e/runner/helpers/mid_flight_tile_evaluator.py:43-44`, `e2e/runner/helpers/aged_tile_rejection_evaluator.py:33-34` +- **Description**: Two helpers define the same FDR record-kind constant + stale-reason string. The duplication exists because each helper aims to be self-contained, but the underlying contract is single-source (FDR record-type vocabulary). +- **Recommendation**: address in the same hygiene batch as F1 by creating `runner/helpers/fdr_record_kinds.py` and importing the constants from there. Combine with F1 → estimated 2-pt hygiene batch. + +## Resolved Findings + +None. F1 from the prior window still stands; no fix landed in this window. + +## Carried Forward to Next Window (85-87) + +- F1, F2 unresolved. +- Production-dependency list above is the AZ-595 fixture-builder + epic-side SUT work pending. 
Tracker visibility: the items are surfaced in each batch report; an aggregator PBI for the fixture-builder side is recommended but not required this cycle. + +## Verdict + +0 Critical, 0 High, 2 Low (1 carry-over + 1 new). No FAIL trigger; cumulative posture remains **PASS** with two known maintainability items eligible for a dedicated hygiene batch. + +Batch 84 proceeds to commit + push. Next cumulative review covers batches 85-87. diff --git a/_docs/03_implementation/reviews/batch_84_review.md b/_docs/03_implementation/reviews/batch_84_review.md new file mode 100644 index 0000000..109a137 --- /dev/null +++ b/_docs/03_implementation/reviews/batch_84_review.md @@ -0,0 +1,120 @@ +# Code Review Report + +**Batch**: 84 — AZ-423 + AZ-427 (FT-P-19 sat-relocalization scale + FT-N-05 stale-tile rejection) +**Date**: 2026-05-17 +**Verdict**: PASS_WITH_WARNINGS + +## Files Reviewed + +**Created**: +- `e2e/runner/helpers/retrieval_evaluator.py` +- `e2e/runner/helpers/aged_tile_rejection_evaluator.py` +- `e2e/tests/positive/test_ft_p_19_sat_reloc_scale.py` +- `e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py` +- `e2e/_unit_tests/helpers/test_retrieval_evaluator.py` +- `e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py` + +**Modified**: +- `e2e/_unit_tests/test_directory_layout.py` (registered 4 new paths) + +## Findings + +| # | Severity | Category | File:Line | Title | +|---|----------|----------|-----------|-------| +| 1 | Low | Spec-Gap | `e2e/tests/positive/test_ft_p_19_sat_reloc_scale.py`, `e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py` | Tests depend on upstream production + fixture-builder features | +| 2 | Low | Maintainability | `e2e/runner/helpers/aged_tile_rejection_evaluator.py`, `e2e/runner/helpers/mid_flight_tile_evaluator.py` | `TILE_LOAD_REJECTED_FDR_KIND` and stale-reason constant now duplicated across two helpers | + +### Finding Details + +**F1: Tests depend on upstream production + fixture-builder features that don't exist 
yet** (Low / Spec-Gap) +- Location: `e2e/tests/positive/test_ft_p_19_sat_reloc_scale.py`, `e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py` +- Description: Both scenarios require: + * **AZ-423**: SUT emitting FDR `retrieval-topk` records (one per pushed image, carrying 10 candidate tile centres) and `scene-change-match` records for the 2 paired `_gmaps.png` images. Production code must expose the C2 top-K=10 retrieval result on the FDR boundary. + * **AZ-427**: SUT either (a) emitting FDR `tile-load-rejected: stale` events at startup OR (b) downgrading every emission's `source_label` to `{visual_propagated, dead_reckoned}` for the aged-tile sub-cases. Production code must wire the C6 freshness gate per-sector (active-conflict vs rear). + * **Fixture-builder**: snapshot/mount of `synth-age-7mo` / `synth-age-13mo` tile sets + `E2E_FT_N_05_FIXTURE` env var declaring the active sub-case fixture per pytest invocation. +- Tests **skip cleanly** when `sitl_replay_ready` / `E2E_FT_N_05_FIXTURE` are absent and **fail loudly** when the fixture exists but the required records are missing — adhering to the "tests as gates" principle. +- Suggestion: Surface as a single line in the AZ-423 + AZ-427 batch report under Production Dependencies. No code change in this batch. +- Tasks: AZ-423, AZ-427. + +**F2: `TILE_LOAD_REJECTED_FDR_KIND` and stale-reason constant duplicated** (Low / Maintainability) +- Location: `e2e/runner/helpers/aged_tile_rejection_evaluator.py:33-34`, `e2e/runner/helpers/mid_flight_tile_evaluator.py:43-44` +- Description: Both helpers redefine `TILE_LOAD_REJECTED_FDR_KIND = "tile-load-rejected"` and the stale-reason constant. This is intentional in spirit (each helper exposes the FDR vocabulary it operates on) but creates two copies of the same FDR-contract string. If the SUT renames the record type later, both files must change in lockstep. 
+- Suggestion: Consolidate in a dedicated `runner/helpers/fdr_record_kinds.py` module in a future hygiene PBI; not in scope for AZ-423/AZ-427. +- Tasks: hygiene, future batch. + +## Phase 1: Context Loading + +Read AZ-423 + AZ-427 task specs. Both have 3 ACs; both are blackbox-test deliverables under E-BBT (AZ-262); both depend only on AZ-406 + AZ-407 (test infra + static fixtures); SUT boundary statements verified. + +## Phase 2: Spec Compliance + +### AZ-423 (FT-P-19) + +| AC | Helper | Scenario assertion | Unit-test coverage | +|----|--------|--------------------|--------------------| +| AC-1 (top-K=10 includes tile within 100 m of true centre, 60 stills) | `evaluate_top_k_within_distance` | `assert topk_report.passes` | 9 cases (single close, all far, mixed top-K, empty, count mismatch, invalid tolerance, custom tolerance, 60 all-pass, 60 one-fail) | +| AC-2 (scene-change PARTIAL on 2 `_gmaps.png` pairs) | `evaluate_scene_change_subset` | `assert coverage_complete AND overall_label == "PARTIAL"` | 5 cases (both matched still PARTIAL, zero matched still PARTIAL, one image only, empty, wrong image ids) | +| AC-3 (parameterised) | conftest fixtures | 6 collected variants | — | + +Plus 2 CSV-writer round-trip tests, 5 `project_topk_record_to_query` cases (happy / malformed / empty / non-dict / type errors), 5 `project_scene_change_record` cases, 2 `iter_*_payloads` filter cases — 29 unit tests total for AZ-423. 
+ +### AZ-427 (FT-N-05) + +| AC | Helper | Scenario assertion | Unit-test coverage | +|----|--------|--------------------|--------------------| +| AC-1 (7mo aged tiles in active-conflict sector → 0 anchored emissions) | `evaluate_aged_tile_rejection(fixture="synth-age-7mo", sector="active_conflict", …)` | parametrised sub-case | 3 Signal-A cases + 3 Signal-B cases + 5 degenerate-input cases | +| AC-2 (13mo aged tiles in rear sector → 0 anchored emissions) | same helper with rear fixture binding | parametrised sub-case | same as AC-1 (sub-case symmetric) | +| AC-3 (parameterised) | conftest fixtures + 2 sub-case parametrisation | 12 collected variants (6 × 2) | — | + +Plus 6 `project_stale_rejection_payload` cases (happy / id-alias / wrong-reason / non-dict / missing-id / wrong-type), 1 `iter_stale_rejection_payloads` filter case, 1 `AGED_FIXTURE_SECTOR_BINDINGS` contract assertion — 19 unit tests total for AZ-427. + +All ACs satisfied. `@pytest.mark.traces_to` markers wire scenarios to AC IDs. + +## Phase 3: Code Quality + +- **SOLID**: each evaluator is a pure function over typed input dataclasses returning a frozen-dataclass report. `retrieval_evaluator` and `aged_tile_rejection_evaluator` are siblings of the existing `mid_flight_tile_evaluator` / `gcs_telemetry_evaluator` / `tile_cache_inspector` pattern. +- **Error handling**: `evaluate_top_k_within_distance` raises `ValueError` on invalid tolerance. `evaluate_aged_tile_rejection` has no failure modes (always returns a report; the report's `.passes` carries the verdict). No bare `except`. +- **Naming**: `evaluate_*` matches sibling helpers. `_normalise_image_id` in scenario test mirrors the convention from `accuracy_evaluator` GT keying. `Signal A` / `Signal B` properties make the decision matrix self-documenting. +- **Complexity**: longest function `evaluate_top_k_within_distance` at ~25 lines; `evaluate_aged_tile_rejection` at ~20 lines. All under coderule's 50-line threshold. 
+- **DRY**: F2 captures one cross-batch (intra-component) duplication. Within each helper, no duplicated logic. +- **Test quality**: every unit test uses Arrange/Act/Assert comments per coderule. Tests assert specific outcomes (matched_count, anchored_frame_ids, signal flags, CSV content) rather than "no error thrown". +- **Dead code**: none. `iter_topk_payloads` / `iter_scene_change_payloads` / `iter_stale_rejection_payloads` are scenario-test helpers used in the corresponding `tests/` files. + +## Phase 4: Security Quick-Scan + +- No SQL, no `subprocess`, no `exec`/`eval`. +- No hardcoded secrets. +- Input validation: all `project_*` functions guard against malformed dict payloads; `evaluate_top_k_within_distance` validates tolerance. +- No sensitive data in logs or error messages. +- `evaluate_aged_tile_rejection` surfaces `illegal_labels` separately from passes/fails so a contract-violation defect can't be misread as a freshness-gate defect. + +## Phase 5: Performance Scan + +- `evaluate_top_k_within_distance` is O(N·K) where N=60 images, K=10 candidates → 600 distance computations per scenario. Vincenty via `pyproj` handles this in well under 100 ms. +- `evaluate_aged_tile_rejection` is O(M) where M = emissions count (≤ 60) + rejections count (typically 0–2). Trivial. +- FDR iteration is generator-based via `fdr_reader.iter_records`. +- No N+1 patterns. No unbounded fetching. + +## Phase 6: Cross-Task Consistency + +- Both helpers reuse `runner.helpers.geo.distance_m` (Vincenty WGS84) — consistent with `mid_flight_tile_evaluator` from batch 83. +- `aged_tile_rejection_evaluator` reuses `ALLOWED_SOURCE_LABELS` from `estimate_schema` — single source of truth for the FT-P-03 / AC-1.4 source-label contract. +- Scenario tests follow the same skip pattern as FT-P-12/13/15/16/17/18 (sitl_replay_ready gate + fail-loud on missing records). +- The frozen-dataclass + `@property passes` convention matches every prior helper (batches 79-83). 
+- `traces_to` markers + CSV evidence emission to `ft-p-19-*.csv` / `ft-n-05-*.csv` follow the FT-P-01 / FT-P-05 / FT-P-17 cadence. + +## Phase 7: Architecture Compliance + +- All new files under `e2e/` — owned exclusively by the Blackbox Tests component per `_docs/02_document/module-layout.md`. +- **No imports from `src/gps_denied_onboard`** — verified by reading every import in the new files; both helpers carry an explicit "public-boundary discipline" docstring note. +- No layering violations. +- No new cyclic module dependencies. Both helpers import from `.geo` / `.estimate_schema` only; tests import from `runner.helpers.*`. +- No duplicate symbols across components. The intra-component duplicate FDR-kind constant (F2) is intra-helper-set, not cross-component. +- No cross-cutting concerns re-implemented locally; geo math, source-label set, and CSV emission all delegate to project-wide utilities. + +## Verdict + +- 0 Critical, 0 High → no FAIL trigger. +- 2 Low → **PASS_WITH_WARNINGS**. + +Batch 84 is ready to commit. The two Low findings are surfaced for batch report and feed forward into the cumulative review for batches 82–84. 
diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 3ee0c3e..d2a9934 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -12,9 +12,9 @@ sub_step: retry_count: 0 cycle: 1 tracker: jira -last_completed_batch: 83 -last_cumulative_review: batches_79-81 -current_batch: 84 +last_completed_batch: 84 +last_cumulative_review: batches_82-84 +current_batch: 85 last_step_outcomes: step_8: "Code is testable — no changes needed (testability_assessment.md committed; no list-of-changes, no source edits)" diff --git a/e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py b/e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py new file mode 100644 index 0000000..789fc33 --- /dev/null +++ b/e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py @@ -0,0 +1,287 @@ +"""Unit tests for ``runner.helpers.aged_tile_rejection_evaluator`` (AZ-427). + +Pure-logic AC-8.2 / AC-NEW-6 coverage for FT-N-05. +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from runner.helpers import aged_tile_rejection_evaluator as ate + + +def _emission(frame_id: str = "AD000001.jpg", label: str = "satellite_anchored") -> ate.SourceLabelEmission: + return ate.SourceLabelEmission(frame_id=frame_id, label=label) + + +def _rejection(tile_id: str = "tile_001", reason: str = "stale") -> ate.StaleTileRejection: + return ate.StaleTileRejection(tile_id=tile_id, reason=reason) + + +# ─────────────────────── Signal A (load-time rejection) ─────────────────────── + + +def test_signal_a_stale_rejected_no_emissions_passes() -> None: + # Arrange — SUT rejected the aged tiles at load; nothing else ran + report = ate.evaluate_aged_tile_rejection( + sub_case_id="7mo_active_conflict", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=[], + rejections=[_rejection()], + ) + # Assert + assert report.passes + assert report.signal_a_holds + assert not report.signal_b_holds + assert report.has_stale_rejection + assert 
report.anchored_count == 0 + + +def test_signal_a_stale_rejected_plus_downgrade_passes() -> None: + # Arrange — rejected at load AND every later emission is downgraded + emissions = [_emission(label="visual_propagated"), _emission(label="dead_reckoned")] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="13mo_rear", + fixture="synth-age-13mo", + sector="rear", + emissions=emissions, + rejections=[_rejection()], + ) + # Assert + assert report.passes + assert report.signal_a_holds + + +def test_signal_a_stale_rejected_but_one_anchored_fails() -> None: + # Arrange — even with stale rejection, an anchored emission is a fatal fail + emissions = [_emission(label="satellite_anchored"), _emission(label="visual_propagated")] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=emissions, + rejections=[_rejection()], + ) + # Assert + assert not report.passes + assert report.anchored_count == 1 + assert not report.signal_a_holds # short-circuit due to anchored emission + + +# ─────────────────────── Signal B (per-frame downgrade) ─────────────────────── + + +def test_signal_b_all_downgraded_no_rejection_passes() -> None: + # Arrange — tiles loaded, every emission downgraded + emissions = [_emission(f"f_{i}", label="visual_propagated") for i in range(60)] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=emissions, + rejections=[], + ) + # Assert + assert report.passes + assert not report.signal_a_holds + assert report.signal_b_holds + + +def test_signal_b_mixed_downgrade_labels_pass() -> None: + # Arrange — visual_propagated + dead_reckoned mix + emissions = [ + _emission("f_1", label="visual_propagated"), + _emission("f_2", label="dead_reckoned"), + _emission("f_3", label="visual_propagated"), + ] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + 
fixture="synth-age-7mo", + sector="active_conflict", + emissions=emissions, + rejections=[], + ) + # Assert + assert report.passes + + +def test_signal_b_one_anchored_among_downgrades_fails() -> None: + # Arrange — 59 downgraded + 1 anchored = FAIL + emissions = [_emission(f"f_{i}", label="visual_propagated") for i in range(59)] + emissions.append(_emission("f_60", label="satellite_anchored")) + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=emissions, + rejections=[], + ) + # Assert + assert not report.passes + assert report.anchored_count == 1 + + +# ─────────────────────── degenerate inputs ─────────────────────── + + +def test_no_emissions_no_rejection_fails() -> None: + # Arrange — we can't conclude the freshness gate fired at all + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=[], + rejections=[], + ) + # Assert + assert not report.passes + assert not report.signal_a_holds + assert not report.signal_b_holds + + +def test_illegal_label_fails_regardless_of_signals() -> None: + # Arrange — a label outside the FT-P-03 set is a hard fail + emissions = [_emission(label="garbage")] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=emissions, + rejections=[_rejection()], + ) + # Assert + assert not report.passes + assert report.illegal_labels == (("AD000001.jpg", "garbage"),) + + +def test_non_stale_rejection_ignored() -> None: + # Arrange — reason != "stale" must not satisfy signal A + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=[], + rejections=[_rejection(reason="below_floor")], + ) + # Assert + assert not report.has_stale_rejection + assert not report.passes + + +def test_multiple_stale_rejections_collected() -> 
None: + # Arrange + rejections = [_rejection("t_a"), _rejection("t_b")] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-7mo", + sector="active_conflict", + emissions=[], + rejections=rejections, + ) + # Assert + assert report.stale_rejections == ("t_a", "t_b") + + +def test_report_anchored_frame_ids_preserved() -> None: + # Arrange — anchored emissions must surface their frame IDs for diagnosis + emissions = [ + _emission("AD000010.jpg", label="satellite_anchored"), + _emission("AD000020.jpg", label="satellite_anchored"), + ] + # Act + report = ate.evaluate_aged_tile_rejection( + sub_case_id="x", + fixture="synth-age-13mo", + sector="rear", + emissions=emissions, + rejections=[], + ) + # Assert + assert report.anchored_frame_ids == ("AD000010.jpg", "AD000020.jpg") + + +# ─────────────────────── project_stale_rejection_payload ─────────────────────── + + +def test_project_stale_rejection_happy_path() -> None: + # Act + r = ate.project_stale_rejection_payload({"tile_id": "t_a", "reason": "stale"}) + # Assert + assert r is not None + assert r.tile_id == "t_a" + + +def test_project_stale_rejection_id_alias_accepted() -> None: + # Act + r = ate.project_stale_rejection_payload({"id": "t_b", "reason": "stale"}) + # Assert + assert r is not None + assert r.tile_id == "t_b" + + +def test_project_stale_rejection_wrong_reason_returns_none() -> None: + # Act + r = ate.project_stale_rejection_payload({"tile_id": "t_a", "reason": "below_floor"}) + # Assert + assert r is None + + +def test_project_stale_rejection_non_dict_returns_none() -> None: + # Act / Assert + assert ate.project_stale_rejection_payload(None) is None + assert ate.project_stale_rejection_payload("nope") is None + + +def test_project_stale_rejection_missing_id_returns_none() -> None: + # Act / Assert + assert ate.project_stale_rejection_payload({"reason": "stale"}) is None + + +def test_project_stale_rejection_id_not_string_returns_none() -> None: + # Act / Assert + 
assert ate.project_stale_rejection_payload({"tile_id": 42, "reason": "stale"}) is None + + +# ─────────────────────── iter_stale_rejection_payloads ─────────────────────── + + +class _StubRecord: + def __init__(self, record_type: str, payload: Any) -> None: + self.record_type = record_type + self.payload = payload + + +def test_iter_stale_rejection_payloads_filters_record_type() -> None: + # Arrange + records = [ + _StubRecord("tile-load-rejected", {"tile_id": "t_a", "reason": "stale"}), + _StubRecord("retrieval-topk", {"image_id": "AD000001"}), + _StubRecord("tile-load-rejected", {"tile_id": "t_b", "reason": "below_floor"}), + _StubRecord("tile-load-rejected", {"tile_id": "t_c", "reason": "stale"}), + ] + # Act + rejections = list(ate.iter_stale_rejection_payloads(records)) + # Assert + assert [r.tile_id for r in rejections] == ["t_a", "t_c"] + + +# ─────────────────────── AGED_FIXTURE_SECTOR_BINDINGS ─────────────────────── + + +def test_aged_fixture_sector_bindings_match_spec() -> None: + # Assert — frozen contract from FT-N-05 AC-8.2 sub-case table + assert dict(ate.AGED_FIXTURE_SECTOR_BINDINGS) == { + "synth-age-7mo": ate.SECTOR_ACTIVE_CONFLICT, + "synth-age-13mo": ate.SECTOR_REAR, + } diff --git a/e2e/_unit_tests/helpers/test_retrieval_evaluator.py b/e2e/_unit_tests/helpers/test_retrieval_evaluator.py new file mode 100644 index 0000000..6133e1a --- /dev/null +++ b/e2e/_unit_tests/helpers/test_retrieval_evaluator.py @@ -0,0 +1,389 @@ +"""Unit tests for ``runner.helpers.retrieval_evaluator`` (AZ-423). + +Pure-logic AC-8.6 coverage for FT-P-19 — the scenario in +``e2e/tests/positive/test_ft_p_19_sat_reloc_scale.py`` exercises the +same helpers end-to-end when the SITL fixture is prepared; this file +covers them in isolation. 
+""" + +from __future__ import annotations + +import csv +from pathlib import Path +from typing import Any + +import pytest + +from runner.helpers import retrieval_evaluator as re_ + +DERKACHI_LAT = 48.275292 +DERKACHI_LON = 37.385220 + + +def _candidate( + tile_id: str = "tile_001", + lat: float = DERKACHI_LAT, + lon: float = DERKACHI_LON, +) -> re_.CandidateTile: + return re_.CandidateTile(tile_id=tile_id, centre_lat_deg=lat, centre_lon_deg=lon) + + +def _query( + image_id: str = "AD000001", + *, + true_lat: float = DERKACHI_LAT, + true_lon: float = DERKACHI_LON, + candidates: tuple[re_.CandidateTile, ...] = (), +) -> re_.TopKQuery: + return re_.TopKQuery( + image_id=image_id, + true_centre_lat_deg=true_lat, + true_centre_lon_deg=true_lon, + candidates=candidates, + ) + + +# ─────────────────────── evaluate_top_k_within_distance ─────────────────────── + + +def test_evaluate_top_k_one_candidate_close_passes() -> None: + # Arrange — single candidate exactly at GT + q = _query(candidates=(_candidate(),)) + # Act + report = re_.evaluate_top_k_within_distance([q], expected_image_count=1) + # Assert + assert report.passes + assert report.pass_count == 1 + assert report.entries[0].min_distance_m == pytest.approx(0.0, abs=0.01) + + +def test_evaluate_top_k_all_candidates_far_fails() -> None: + # Arrange — candidate ~ 1 km east at this latitude + far = _candidate(tile_id="far", lat=DERKACHI_LAT, lon=DERKACHI_LON + 0.0135) + q = _query(candidates=(far,)) + # Act + report = re_.evaluate_top_k_within_distance([q], expected_image_count=1) + # Assert + assert not report.passes + assert report.entries[0].pass_distance is False + assert (report.entries[0].min_distance_m or 0) > 100.0 + + +def test_evaluate_top_k_one_close_candidate_among_far_passes() -> None: + # Arrange — 9 far + 1 close in top-K (any rank passes) + far_tiles = tuple( + _candidate(tile_id=f"far_{i}", lon=DERKACHI_LON + 0.01 * (i + 1)) + for i in range(9) + ) + close = _candidate(tile_id="close", 
lat=DERKACHI_LAT, lon=DERKACHI_LON) + q = _query(candidates=far_tiles + (close,)) + # Act + report = re_.evaluate_top_k_within_distance([q], expected_image_count=1) + # Assert + assert report.passes + + +def test_evaluate_top_k_empty_candidates_fails() -> None: + # Arrange + q = _query(candidates=()) + # Act + report = re_.evaluate_top_k_within_distance([q], expected_image_count=1) + # Assert + assert not report.passes + assert report.entries[0].min_distance_m is None + assert report.entries[0].candidate_count == 0 + + +def test_evaluate_top_k_short_query_count_fails_aggregate() -> None: + # Arrange — 1 passing query but expected_image_count=60 + q = _query(candidates=(_candidate(),)) + # Act + report = re_.evaluate_top_k_within_distance([q], expected_image_count=60) + # Assert + assert not report.passes + assert report.pass_count == 1 + + +def test_evaluate_top_k_invalid_tolerance_raises() -> None: + with pytest.raises(ValueError, match="max_distance_m"): + re_.evaluate_top_k_within_distance( + [_query(candidates=(_candidate(),))], max_distance_m=0 + ) + + +def test_evaluate_top_k_custom_tolerance() -> None: + # Arrange — candidate 200m east; default 100m fails, custom 250m passes + far_200m = _candidate(lat=DERKACHI_LAT, lon=DERKACHI_LON + 0.0027) + q = _query(candidates=(far_200m,)) + # Act + strict = re_.evaluate_top_k_within_distance([q], max_distance_m=100, expected_image_count=1) + lenient = re_.evaluate_top_k_within_distance([q], max_distance_m=250, expected_image_count=1) + # Assert + assert not strict.passes + assert lenient.passes + + +def test_evaluate_top_k_aggregate_60_all_pass() -> None: + # Arrange — 60 queries, each with one close candidate + queries = [_query(image_id=f"AD0000{i:02d}", candidates=(_candidate(),)) for i in range(1, 61)] + # Act + report = re_.evaluate_top_k_within_distance(queries, expected_image_count=60) + # Assert + assert report.passes + assert report.pass_count == 60 + + +def test_evaluate_top_k_aggregate_60_one_fail() -> 
None: + # Arrange — 59 pass + 1 fail + queries = [_query(image_id=f"AD0000{i:02d}", candidates=(_candidate(),)) for i in range(1, 60)] + queries.append(_query(image_id="AD000060", candidates=())) + # Act + report = re_.evaluate_top_k_within_distance(queries, expected_image_count=60) + # Assert + assert not report.passes + assert report.pass_count == 59 + assert len(report.failing_entries) == 1 + + +# ─────────────────────── evaluate_scene_change_subset ─────────────────────── + + +def test_evaluate_scene_change_both_matched_still_partial() -> None: + # Arrange — both pairs matched + matches = [ + re_.SceneChangeMatch(image_id="AD000001", matched=True, inlier_count=120), + re_.SceneChangeMatch(image_id="AD000002", matched=True, inlier_count=98), + ] + # Act + report = re_.evaluate_scene_change_subset(matches) + # Assert + assert report.coverage_complete + assert report.overall_label == re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL + assert report.matched_count == 2 + + +def test_evaluate_scene_change_zero_matched_still_partial() -> None: + # Arrange + matches = [ + re_.SceneChangeMatch(image_id="AD000001", matched=False, inlier_count=0), + re_.SceneChangeMatch(image_id="AD000002", matched=False, inlier_count=0), + ] + # Act + report = re_.evaluate_scene_change_subset(matches) + # Assert + assert report.overall_label == re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL + + +def test_evaluate_scene_change_one_image_only_coverage_incomplete() -> None: + # Arrange + matches = [re_.SceneChangeMatch(image_id="AD000001", matched=True, inlier_count=120)] + # Act + report = re_.evaluate_scene_change_subset(matches) + # Assert + assert not report.coverage_complete + # PARTIAL label still set (decoupled from coverage) + assert report.overall_label == re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL + + +def test_evaluate_scene_change_empty_coverage_incomplete() -> None: + # Act + report = re_.evaluate_scene_change_subset([]) + # Assert + assert not report.coverage_complete + + +def 
test_evaluate_scene_change_extra_image_ids_coverage_incomplete() -> None: + # Arrange — image set that doesn't match expected pair ids + matches = [ + re_.SceneChangeMatch(image_id="AD000099", matched=True, inlier_count=120), + re_.SceneChangeMatch(image_id="AD000002", matched=True, inlier_count=80), + ] + # Act + report = re_.evaluate_scene_change_subset(matches) + # Assert + assert not report.coverage_complete + + +# ─────────────────────── CSV writers ─────────────────────── + + +def test_write_top_k_csv_round_trip(tmp_path: Path) -> None: + # Arrange + out = tmp_path / "topk.csv" + queries = [ + _query(image_id="AD000001", candidates=(_candidate(),)), + _query(image_id="AD000002", candidates=()), + ] + report = re_.evaluate_top_k_within_distance(queries, expected_image_count=2) + # Act + re_.write_top_k_csv(out, report) + rows = list(csv.reader(out.open(encoding="utf-8"))) + # Assert + assert rows[0] == list(re_.TOP_K_CSV_HEADER) + assert rows[1][0] == "AD000001" + assert rows[1][3] == "true" + assert rows[2][0] == "AD000002" + assert rows[2][2] == "" # min_distance_m is None when no candidates + assert rows[2][3] == "false" + + +def test_write_scene_change_csv_round_trip(tmp_path: Path) -> None: + # Arrange + out = tmp_path / "scene_change.csv" + matches = [ + re_.SceneChangeMatch(image_id="AD000001", matched=True, inlier_count=120), + re_.SceneChangeMatch(image_id="AD000002", matched=False, inlier_count=None), + ] + report = re_.evaluate_scene_change_subset(matches) + # Act + re_.write_scene_change_csv(out, report) + rows = list(csv.reader(out.open(encoding="utf-8"))) + # Assert + assert rows[0] == list(re_.SCENE_CHANGE_CSV_HEADER) + assert rows[1] == ["AD000001", "true", "120", re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL] + assert rows[2] == ["AD000002", "false", "", re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL] + + +def test_write_top_k_csv_missing_parent_dir_raises(tmp_path: Path) -> None: + out = tmp_path / "nope" / "topk.csv" + report = 
re_.evaluate_top_k_within_distance( + [_query(candidates=(_candidate(),))], expected_image_count=1 + ) + with pytest.raises(OSError): + re_.write_top_k_csv(out, report) + + +# ─────────────────────── project_topk_record_to_query ─────────────────────── + + +def test_project_topk_happy_path() -> None: + # Arrange + payload = { + "image_id": "AD000001", + "candidates": [ + {"tile_id": "t_a", "centre_lat_deg": DERKACHI_LAT, "centre_lon_deg": DERKACHI_LON}, + {"tile_id": "t_b", "centre_lat_deg": DERKACHI_LAT + 0.001, "centre_lon_deg": DERKACHI_LON}, + ], + } + # Act + q = re_.project_topk_record_to_query( + payload, true_centre_lat_deg=DERKACHI_LAT, true_centre_lon_deg=DERKACHI_LON + ) + # Assert + assert q is not None + assert q.image_id == "AD000001" + assert len(q.candidates) == 2 + assert q.candidates[0].tile_id == "t_a" + + +def test_project_topk_skips_malformed_candidates() -> None: + # Arrange + payload = { + "image_id": "AD000002", + "candidates": [ + {"tile_id": "ok", "centre_lat_deg": DERKACHI_LAT, "centre_lon_deg": DERKACHI_LON}, + "not a dict", + {"tile_id": "missing_lat", "centre_lon_deg": DERKACHI_LON}, + {"tile_id": "wrong_type", "centre_lat_deg": "stringy", "centre_lon_deg": DERKACHI_LON}, + ], + } + # Act + q = re_.project_topk_record_to_query( + payload, true_centre_lat_deg=DERKACHI_LAT, true_centre_lon_deg=DERKACHI_LON + ) + # Assert + assert q is not None + assert [c.tile_id for c in q.candidates] == ["ok"] + + +def test_project_topk_non_dict_payload_returns_none() -> None: + # Act / Assert + assert re_.project_topk_record_to_query("not a dict", 0, 0) is None # type: ignore[arg-type] + + +def test_project_topk_missing_image_id_returns_none() -> None: + # Act / Assert + assert re_.project_topk_record_to_query({"candidates": []}, 0, 0) is None + + +def test_project_topk_missing_candidates_returns_none() -> None: + # Act / Assert + assert re_.project_topk_record_to_query({"image_id": "AD000001"}, 0, 0) is None + + +# ─────────────────────── 
project_scene_change_record ─────────────────────── + + +def test_project_scene_change_happy_path() -> None: + # Arrange + payload = {"image_id": "AD000001", "matched": True, "inlier_count": 120} + # Act + m = re_.project_scene_change_record(payload) + # Assert + assert m is not None + assert m.matched is True + assert m.inlier_count == 120 + + +def test_project_scene_change_inlier_count_missing_is_none() -> None: + # Act + m = re_.project_scene_change_record({"image_id": "AD000001", "matched": False}) + # Assert + assert m is not None + assert m.inlier_count is None + + +def test_project_scene_change_inlier_count_bool_is_none() -> None: + # Act — bool is technically int in Python, but treat as missing for inlier_count + m = re_.project_scene_change_record( + {"image_id": "AD000001", "matched": True, "inlier_count": True} + ) + # Assert + assert m is not None + assert m.inlier_count is None + + +def test_project_scene_change_matched_not_bool_returns_none() -> None: + # Act / Assert + assert re_.project_scene_change_record({"image_id": "AD000001", "matched": "yes"}) is None + + +def test_project_scene_change_non_dict_returns_none() -> None: + # Act / Assert + assert re_.project_scene_change_record(None) is None + assert re_.project_scene_change_record("nope") is None + + +# ─────────────────────── iter_*_payloads ─────────────────────── + + +class _StubRecord: + def __init__(self, record_type: str, payload: Any) -> None: + self.record_type = record_type + self.payload = payload + + +def test_iter_topk_payloads_filters_by_record_type() -> None: + # Arrange + records = [ + _StubRecord("retrieval-topk", {"image_id": "AD000001"}), + _StubRecord("scene-change-match", {"image_id": "AD000002"}), + _StubRecord("retrieval-topk", {"image_id": "AD000003"}), + _StubRecord("other-kind", {}), + ] + # Act + payloads = list(re_.iter_topk_payloads(records)) + # Assert + assert [p["image_id"] for p in payloads] == ["AD000001", "AD000003"] + + +def 
test_iter_scene_change_payloads_filters_by_record_type() -> None: + # Arrange + records = [ + _StubRecord("retrieval-topk", {"image_id": "AD000001"}), + _StubRecord("scene-change-match", {"image_id": "AD000002"}), + ] + # Act + payloads = list(re_.iter_scene_change_payloads(records)) + # Assert + assert payloads == [{"image_id": "AD000002"}] diff --git a/e2e/_unit_tests/test_directory_layout.py b/e2e/_unit_tests/test_directory_layout.py index b146aba..bd61a04 100644 --- a/e2e/_unit_tests/test_directory_layout.py +++ b/e2e/_unit_tests/test_directory_layout.py @@ -55,6 +55,8 @@ E2E_ROOT = Path(__file__).resolve().parents[1] "runner/helpers/tile_cache_inspector.py", "runner/helpers/mid_flight_tile_evaluator.py", "runner/helpers/mock_suite_sat_audit.py", + "runner/helpers/retrieval_evaluator.py", + "runner/helpers/aged_tile_rejection_evaluator.py", "runner/helpers/cold_start_evaluator.py", "runner/helpers/outlier_tolerance_evaluator.py", "runner/helpers/outage_request_evaluator.py", @@ -116,10 +118,12 @@ E2E_ROOT = Path(__file__).resolve().parents[1] "tests/positive/test_ft_p_16_offline_only.py", "tests/positive/test_ft_p_17_mid_flight_tiles.py", "tests/positive/test_ft_p_18_no_raw_retention.py", + "tests/positive/test_ft_p_19_sat_reloc_scale.py", "tests/negative/test_ft_n_01_outlier_tolerance.py", "tests/negative/test_ft_n_02_sharp_turn_failure.py", "tests/negative/test_ft_n_03_outage_reloc.py", "tests/negative/test_ft_n_04_blackout_spoof.py", + "tests/negative/test_ft_n_05_stale_tile_rejection.py", "tests/negative/test_ft_n_06_mid_flight_freshness.py", ], ) diff --git a/e2e/runner/helpers/aged_tile_rejection_evaluator.py b/e2e/runner/helpers/aged_tile_rejection_evaluator.py new file mode 100644 index 0000000..91762d6 --- /dev/null +++ b/e2e/runner/helpers/aged_tile_rejection_evaluator.py @@ -0,0 +1,218 @@ +"""Aged-tile rejection evaluator (AZ-427 / FT-N-05). 
+ +Pure-logic helpers for AC-8.2 / AC-NEW-6 negative scenario: when the +SUT mounts a tile cache whose manifest dates exceed the freshness +window for the configured sector, NO outbound emission may carry +``source_label = satellite_anchored``. + +The AC accepts two distinct evidence shapes (per the task spec): + +* **Signal A — load-time rejection**: an FDR ``tile-load-rejected`` + record with ``reason == "stale"`` was emitted at startup for the + stale tile(s). Conceptually the SUT refused to load aged tiles at + all, so no later emission could be anchored on them. + +* **Signal B — per-frame downgrade**: the SUT loaded the tiles but + the freshness gate downgrades every outbound emission to + ``source_label ∈ {visual_propagated, dead_reckoned}``. + +A run passes iff at least one signal holds AND no emission slipped +through with ``satellite_anchored``. A run fails the moment ANY frame +carries ``satellite_anchored`` regardless of which signal otherwise +held. + +The scenario test pulls emissions from the SITL observer / FDR stream +and feeds a typed list to ``evaluate_aged_tile_rejection``; this +module decides whether the parsed inputs satisfy the AC. + +Public-boundary discipline: NO imports from ``src/gps_denied_onboard``. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterable, Sequence + +from .estimate_schema import ALLOWED_SOURCE_LABELS + +# ─────────────────────── FDR record kinds & constants ─────────────────────── + +TILE_LOAD_REJECTED_FDR_KIND = "tile-load-rejected" +TILE_LOAD_REJECTED_STALE_REASON = "stale" + +SATELLITE_ANCHORED_LABEL = "satellite_anchored" +ACCEPTABLE_DOWNGRADE_LABELS: frozenset[str] = frozenset( + {"visual_propagated", "dead_reckoned"} +) + +# Sector configuration values per AC-8.2 — surface them here so the +# scenario test can declare its sub-case configuration in the same +# vocabulary as the AC. 
+SECTOR_ACTIVE_CONFLICT = "active_conflict" +SECTOR_REAR = "rear" + +# Sub-case bindings: synth-age fixture name → sector value the SUT must +# be configured for, per AC-8.2 of the AZ-427 spec. Used by the scenario +# test to declare its parameterisation in spec vocabulary. +AGED_FIXTURE_SECTOR_BINDINGS: tuple[tuple[str, str], ...] = ( + ("synth-age-7mo", SECTOR_ACTIVE_CONFLICT), + ("synth-age-13mo", SECTOR_REAR), +) + + +@dataclass(frozen=True) +class SourceLabelEmission: + """One outbound estimate's source-label observation. + + ``frame_id`` is whatever identifier the scenario test attaches to + the per-frame emission (e.g., still-image filename or per-frame + monotonic counter). It is opaque to this helper — only the label + and its membership in the allowed sets matter for AC evaluation. + + ``label`` is the SUT's emitted ``source_label`` value; the helper + validates it against ``ALLOWED_SOURCE_LABELS`` from + ``estimate_schema``. + """ + + frame_id: str + label: str + + +@dataclass(frozen=True) +class StaleTileRejection: + """One FDR ``tile-load-rejected`` event with ``reason == "stale"``.""" + + tile_id: str + reason: str # always ``"stale"`` for rejections of interest + + +@dataclass(frozen=True) +class AgedTileRejectionReport: + """AC-1 / AC-2 of FT-N-05: aged tiles must never produce satellite_anchored. + + Decision matrix: + + * ``has_stale_rejection`` True AND ``anchored_count`` == 0 → PASS via Signal A. + * ``has_stale_rejection`` False AND every frame in the downgrade set → PASS via Signal B. + * Any frame with ``label == "satellite_anchored"`` → FAIL, period. + * No emissions observed AND no stale rejection → FAIL (we can't + conclude the freshness gate fired at all). + * Any frame with a label outside ``ALLOWED_SOURCE_LABELS`` → FAIL + (this is itself a contract violation per FT-P-03 / AC-1.4 — we + surface it loudly via ``illegal_labels``). 
+ """ + + sub_case_id: str + fixture: str + sector: str + emissions_observed: int + anchored_frame_ids: tuple[str, ...] + downgrade_frame_count: int + illegal_labels: tuple[tuple[str, str], ...] # (frame_id, label) + stale_rejections: tuple[str, ...] # tile_ids + + @property + def has_stale_rejection(self) -> bool: + return bool(self.stale_rejections) + + @property + def anchored_count(self) -> int: + return len(self.anchored_frame_ids) + + @property + def signal_a_holds(self) -> bool: + """Signal A: SUT rejected the aged tiles at load time.""" + return self.has_stale_rejection and self.anchored_count == 0 + + @property + def signal_b_holds(self) -> bool: + """Signal B: SUT loaded but downgrades every emission. + + Requires at least one observation AND every observed emission + falls in the downgrade set (and zero anchored emissions). + """ + return ( + self.emissions_observed > 0 + and self.downgrade_frame_count == self.emissions_observed + and self.anchored_count == 0 + ) + + @property + def passes(self) -> bool: + if self.illegal_labels: + return False + if self.anchored_count > 0: + return False + return self.signal_a_holds or self.signal_b_holds + + +def evaluate_aged_tile_rejection( + sub_case_id: str, + fixture: str, + sector: str, + emissions: Sequence[SourceLabelEmission], + rejections: Iterable[StaleTileRejection], +) -> AgedTileRejectionReport: + """AC-1 / AC-2: decide a single sub-case. + + ``sub_case_id`` is the human-readable label the test attaches to + this sub-case for traceability (e.g., "7mo_active_conflict"). 
+ """ + anchored: list[str] = [] + downgrade: int = 0 + illegal: list[tuple[str, str]] = [] + for em in emissions: + if em.label not in ALLOWED_SOURCE_LABELS: + illegal.append((em.frame_id, em.label)) + continue + if em.label == SATELLITE_ANCHORED_LABEL: + anchored.append(em.frame_id) + elif em.label in ACCEPTABLE_DOWNGRADE_LABELS: + downgrade += 1 + stale_tile_ids = tuple(r.tile_id for r in rejections if r.reason == TILE_LOAD_REJECTED_STALE_REASON) + return AgedTileRejectionReport( + sub_case_id=sub_case_id, + fixture=fixture, + sector=sector, + emissions_observed=len(emissions), + anchored_frame_ids=tuple(anchored), + downgrade_frame_count=downgrade, + illegal_labels=tuple(illegal), + stale_rejections=stale_tile_ids, + ) + + +# ─────────────────────── FDR record projection ─────────────────────── + + +def project_stale_rejection_payload(payload: object) -> StaleTileRejection | None: + """Project one FDR ``tile-load-rejected`` payload onto a ``StaleTileRejection``. + + The payload is expected to carry: + + * ``tile_id`` or ``id`` (str) + * ``reason`` (str) — only ``"stale"`` is relevant for this AC + + Returns ``None`` when malformed OR when the reason is not stale — + AC-NEW-6 cares only about the stale-rejection subset. 
+ """ + if not isinstance(payload, dict): + return None + reason = payload.get("reason") + if reason != TILE_LOAD_REJECTED_STALE_REASON: + return None + raw_id = payload.get("tile_id") or payload.get("id") + if not isinstance(raw_id, str): + return None + return StaleTileRejection(tile_id=raw_id, reason=reason) + + +def iter_stale_rejection_payloads(records: Iterable[object]) -> Iterable[StaleTileRejection]: + """Filter records → yield well-formed stale ``StaleTileRejection`` events.""" + for rec in records: + rt = getattr(rec, "record_type", None) + if rt != TILE_LOAD_REJECTED_FDR_KIND: + continue + rejection = project_stale_rejection_payload(getattr(rec, "payload", None)) + if rejection is not None: + yield rejection diff --git a/e2e/runner/helpers/retrieval_evaluator.py b/e2e/runner/helpers/retrieval_evaluator.py new file mode 100644 index 0000000..1b68a22 --- /dev/null +++ b/e2e/runner/helpers/retrieval_evaluator.py @@ -0,0 +1,380 @@ +"""Top-K retrieval evaluator + scene-change PARTIAL recorder (AZ-423 / FT-P-19). + +Two pure-logic helpers feeding AC-8.6 (AZ-423): + +* **AC-1 — scale-ratio retrievability**: for each of the 60 + ``still-image-set-60`` images the SUT runs top-K=10 retrieval against + its tile cache. The AC passes iff EVERY image has at least one + retrieved tile whose centre lies within 100 m of the image's true + centre. This is a "set_contains" check — we do NOT care which rank + the matching tile occupies, only that it appears in the top-K. + +* **AC-2 — scene-change subset PARTIAL**: for the 2 paired + ``_gmaps.png`` reference images the cross-domain matcher runs; the + helper records the boolean outcome AND tags the subset's overall + result as ``PARTIAL`` unconditionally — because N=2 is too small to + yield a meaningful pass/fail statistic, and the traceability matrix + documents this AC as PARTIAL irrespective of count. 
+ +The scenario test pulls per-frame retrieval candidates from the FDR +``retrieval-topk`` record stream and per-image scene-change outcomes +from the cross-domain matcher's FDR record stream; this module only +decides whether the parsed inputs satisfy the AC. + +Public-boundary discipline: NO imports from ``src/gps_denied_onboard``. +""" + +from __future__ import annotations + +import csv +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, Sequence + +from .geo import distance_m + +# ─────────────────────── FDR record kinds & schema ─────────────────────── + +RETRIEVAL_TOPK_FDR_KIND = "retrieval-topk" +SCENE_CHANGE_MATCH_FDR_KIND = "scene-change-match" + +TOP_K_REQUIRED = 10 +TOP_K_DISTANCE_TOLERANCE_M = 100.0 + +# Scene-change-pair convention: image `AD.jpg` paired with +# `AD_gmaps.png`. Only 2 pairs exist in the project's static +# fixture set (`still-image-sat-refs-2`). +SCENE_CHANGE_PAIRED_IMAGE_IDS: tuple[str, ...] = ("AD000001", "AD000002") +SCENE_CHANGE_SUBSET_PARTIAL_LABEL = "PARTIAL" + + +@dataclass(frozen=True) +class CandidateTile: + """One top-K candidate the SUT retrieved from its tile cache. + + ``centre_lat_deg`` / ``centre_lon_deg`` is the WGS84 centre of the + tile's footprint per ``TileMetadata.centre_wgs84``. + """ + + tile_id: str + centre_lat_deg: float + centre_lon_deg: float + + +@dataclass(frozen=True) +class TopKQuery: + """One image's top-K=10 retrieval result, plus the GT centre. + + The scenario test produces one of these per image; the helper + decides AC-1 per-image and overall. + """ + + image_id: str + true_centre_lat_deg: float + true_centre_lon_deg: float + candidates: tuple[CandidateTile, ...] + + +@dataclass(frozen=True) +class TopKImageReport: + """Per-image AC-1 outcome. + + ``min_distance_m`` is the nearest candidate's distance (m) to the + true centre; ``None`` when ``candidates`` is empty (the helper + treats that as failure). 
+ """ + + image_id: str + candidate_count: int + min_distance_m: float | None + pass_distance: bool + + +@dataclass(frozen=True) +class TopKAggregateReport: + """AC-1 of FT-P-19: every image's top-K covers the true centre.""" + + entries: tuple[TopKImageReport, ...] + max_distance_m: float + expected_image_count: int + + @property + def pass_count(self) -> int: + return sum(1 for e in self.entries if e.pass_distance) + + @property + def failing_entries(self) -> tuple[TopKImageReport, ...]: + return tuple(e for e in self.entries if not e.pass_distance) + + @property + def passes(self) -> bool: + if len(self.entries) != self.expected_image_count: + return False + return self.pass_count == self.expected_image_count + + +def evaluate_top_k_within_distance( + queries: Sequence[TopKQuery], + *, + max_distance_m: float = TOP_K_DISTANCE_TOLERANCE_M, + expected_image_count: int = 60, +) -> TopKAggregateReport: + """AC-1: every image's top-K must include a tile within ``max_distance_m`` m. + + The helper computes the nearest candidate's Vincenty distance to + the true centre for each query and decides per-image. The image + passes iff at least one candidate is within ``max_distance_m``; + the aggregate passes iff every image passes AND the total query + count matches ``expected_image_count``. + + Raises ``ValueError`` on ``max_distance_m <= 0``. 
+ """ + if max_distance_m <= 0: + raise ValueError(f"max_distance_m must be > 0, got {max_distance_m}") + + entries: list[TopKImageReport] = [] + for q in queries: + if not q.candidates: + entries.append( + TopKImageReport( + image_id=q.image_id, + candidate_count=0, + min_distance_m=None, + pass_distance=False, + ) + ) + continue + distances = [ + distance_m( + q.true_centre_lat_deg, + q.true_centre_lon_deg, + c.centre_lat_deg, + c.centre_lon_deg, + ) + for c in q.candidates + ] + min_d = min(distances) + entries.append( + TopKImageReport( + image_id=q.image_id, + candidate_count=len(q.candidates), + min_distance_m=min_d, + pass_distance=min_d <= max_distance_m, + ) + ) + return TopKAggregateReport( + entries=tuple(entries), + max_distance_m=max_distance_m, + expected_image_count=expected_image_count, + ) + + +# ─────────────────────── AC-2 scene-change subset ─────────────────────── + + +@dataclass(frozen=True) +class SceneChangeMatch: + """One paired-image cross-domain matcher outcome.""" + + image_id: str # e.g. "AD000001"; pairs implicitly with `_gmaps.png` + matched: bool + inlier_count: int | None # informational; ``None`` when the matcher didn't report it + + +@dataclass(frozen=True) +class SceneChangeSubsetReport: + """AC-2 of FT-P-19: scene-change subset is structurally PARTIAL. + + The subset's overall_label is ALWAYS ``PARTIAL`` — even when both + images match successfully — because N=2 is too small for a + meaningful pass/fail statistic and the traceability matrix + documents AC-8.6 as PARTIAL irrespective of outcome count. + """ + + entries: tuple[SceneChangeMatch, ...] + expected_image_ids: tuple[str, ...] = SCENE_CHANGE_PAIRED_IMAGE_IDS + overall_label: str = SCENE_CHANGE_SUBSET_PARTIAL_LABEL + + @property + def matched_count(self) -> int: + return sum(1 for e in self.entries if e.matched) + + @property + def coverage_complete(self) -> bool: + """True iff every expected paired-image id has an entry. 
+ + Coverage is a structural completeness check (did we collect + results for both AD000001 and AD000002?); it is independent + of the matcher pass/fail outcome. + """ + observed = {e.image_id for e in self.entries} + return observed == set(self.expected_image_ids) + + +def evaluate_scene_change_subset( + matches: Sequence[SceneChangeMatch], + *, + expected_image_ids: Sequence[str] = SCENE_CHANGE_PAIRED_IMAGE_IDS, +) -> SceneChangeSubsetReport: + """AC-2: record the paired-image matcher outcomes and emit PARTIAL. + + The result is intentionally lenient on pass/fail count — the PARTIAL + annotation comes from the spec, not from the data. + """ + return SceneChangeSubsetReport( + entries=tuple(matches), + expected_image_ids=tuple(expected_image_ids), + ) + + +# ─────────────────────── CSV evidence emission ─────────────────────── + +TOP_K_CSV_HEADER: tuple[str, ...] = ( + "image_id", + "candidate_count", + "min_distance_m", + "pass_distance", +) +SCENE_CHANGE_CSV_HEADER: tuple[str, ...] = ( + "image_id", + "matched", + "inlier_count", + "subset_label", +) + + +def write_top_k_csv(path: Path, report: TopKAggregateReport) -> None: + """Write per-image AC-1 results to ``path`` (CSV, UTF-8, LF newlines). + + Idempotent: overwrites ``path`` if it exists. + Raises ``OSError`` if the parent directory does not exist. + """ + with path.open("w", encoding="utf-8", newline="") as fh: + writer = csv.writer(fh, lineterminator="\n") + writer.writerow(TOP_K_CSV_HEADER) + for e in report.entries: + writer.writerow( + [ + e.image_id, + e.candidate_count, + "" if e.min_distance_m is None else f"{e.min_distance_m:.4f}", + "true" if e.pass_distance else "false", + ] + ) + + +def write_scene_change_csv(path: Path, report: SceneChangeSubsetReport) -> None: + """Write paired-image AC-2 results to ``path`` (CSV) with PARTIAL tag. + + Every row carries the subset's ``overall_label`` so a downstream + consumer can group / filter by it without joining tables. 
+ """ + with path.open("w", encoding="utf-8", newline="") as fh: + writer = csv.writer(fh, lineterminator="\n") + writer.writerow(SCENE_CHANGE_CSV_HEADER) + for e in report.entries: + writer.writerow( + [ + e.image_id, + "true" if e.matched else "false", + "" if e.inlier_count is None else e.inlier_count, + report.overall_label, + ] + ) + + +# ─────────────────────── FDR record projection ─────────────────────── + + +def project_topk_record_to_query( + payload: object, true_centre_lat_deg: float, true_centre_lon_deg: float +) -> TopKQuery | None: + """Project one FDR ``retrieval-topk`` payload onto a ``TopKQuery``. + + The payload is expected to carry: + + * ``image_id`` (str) + * ``candidates`` — list[dict] of {tile_id, centre_lat_deg, centre_lon_deg} + + Returns ``None`` when the payload is malformed (missing fields, wrong + shape) — the scenario logs the skip and treats it as failure at + aggregate time. + """ + if not isinstance(payload, dict): + return None + image_id = payload.get("image_id") + raw_candidates = payload.get("candidates") + if not isinstance(image_id, str) or not isinstance(raw_candidates, (list, tuple)): + return None + candidates: list[CandidateTile] = [] + for c in raw_candidates: + if not isinstance(c, dict): + continue + tile_id = c.get("tile_id") + lat = c.get("centre_lat_deg") + lon = c.get("centre_lon_deg") + if ( + not isinstance(tile_id, str) + or not isinstance(lat, (int, float)) + or not isinstance(lon, (int, float)) + ): + continue + candidates.append( + CandidateTile( + tile_id=tile_id, + centre_lat_deg=float(lat), + centre_lon_deg=float(lon), + ) + ) + return TopKQuery( + image_id=image_id, + true_centre_lat_deg=true_centre_lat_deg, + true_centre_lon_deg=true_centre_lon_deg, + candidates=tuple(candidates), + ) + + +def project_scene_change_record(payload: object) -> SceneChangeMatch | None: + """Project one FDR ``scene-change-match`` payload onto a ``SceneChangeMatch``. 
+ + Expected payload shape: + + * ``image_id`` (str) + * ``matched`` (bool) + * ``inlier_count`` (int | None) — optional + + Returns ``None`` on malformed input. + """ + if not isinstance(payload, dict): + return None + image_id = payload.get("image_id") + matched = payload.get("matched") + if not isinstance(image_id, str) or not isinstance(matched, bool): + return None + inlier_raw = payload.get("inlier_count") + inlier_count: int | None + if inlier_raw is None: + inlier_count = None + elif isinstance(inlier_raw, bool) or not isinstance(inlier_raw, int): + inlier_count = None + else: + inlier_count = inlier_raw + return SceneChangeMatch( + image_id=image_id, matched=matched, inlier_count=inlier_count + ) + + +def iter_topk_payloads(records: Iterable[object]) -> Iterable[object]: + """Filter an iterable of FDR records, yielding ``retrieval-topk`` payloads.""" + for rec in records: + rt = getattr(rec, "record_type", None) + if rt == RETRIEVAL_TOPK_FDR_KIND: + yield getattr(rec, "payload", None) + + +def iter_scene_change_payloads(records: Iterable[object]) -> Iterable[object]: + """Filter an iterable of FDR records, yielding ``scene-change-match`` payloads.""" + for rec in records: + rt = getattr(rec, "record_type", None) + if rt == SCENE_CHANGE_MATCH_FDR_KIND: + yield getattr(rec, "payload", None) diff --git a/e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py b/e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py new file mode 100644 index 0000000..be6bb01 --- /dev/null +++ b/e2e/tests/negative/test_ft_n_05_stale_tile_rejection.py @@ -0,0 +1,168 @@ +"""FT-N-05 — Stale-tile rejection on freshness violation (AZ-427 / AC-8.2, AC-NEW-6). + +Two sub-cases: + +* ``synth-age-7mo`` mounted, SUT configured for active-conflict sector. +* ``synth-age-13mo`` mounted, SUT configured for rear sector. + +For each sub-case: push the 60 still images, then assert one of: + +1. 
**Signal A** — the FDR contains at least one ``tile-load-rejected`` + record with ``reason == "stale"`` AND no outbound emission carried + ``source_label = satellite_anchored``. +2. **Signal B** — the SUT loaded the cache but every emission falls in + ``{visual_propagated, dead_reckoned}``. + +Either signal is acceptable; the AC fails the moment any frame slips +through with ``satellite_anchored`` from these aged tiles. + +What this file owns: + +* The AC-1 / AC-2 / AC-3 wiring for both sub-cases. +* Sector-binding declaration via ``AGED_FIXTURE_SECTOR_BINDINGS``. + +What this file does NOT own: + +* The frame-source push → ``runner.helpers.frame_source_replay`` (stub). +* The SITL message receipt → ``runner.helpers.sitl_observer`` (stub). +* The fixture-builder support for mounting ``synth-age-*`` tiles and + switching sector configuration — surfaced as a production-dependency + finding in the batch report (gated on ``sitl_replay_ready`` + + ``E2E_FT_N_05_FIXTURE`` env var; the test skips cleanly when not set + and fails loudly when set but the records are missing). 
FT_N_05_FIXTURE_ENV = "E2E_FT_N_05_FIXTURE"

GT_CSV = Path(__file__).resolve().parents[3] / "_docs" / "00_problem" / "input_data" / "coordinates.csv"


@pytest.mark.traces_to("AC-8.2,AC-NEW-6,AC-1,AC-2,AC-3")
@pytest.mark.parametrize(
    "fixture, sector",
    ate.AGED_FIXTURE_SECTOR_BINDINGS,
    ids=[f"{f}_{s}" for f, s in ate.AGED_FIXTURE_SECTOR_BINDINGS],
)
def test_ft_n_05_stale_tile_rejection(
    fixture: str,
    sector: str,
    fc_adapter: str,
    vio_strategy: str,
    evidence_dir,  # type: ignore[no-untyped-def]
    run_id: str,
    nfr_recorder,  # type: ignore[no-untyped-def]
    sitl_replay_ready: bool,
) -> None:
    """Full FT-N-05 sub-case (AC-1 or AC-2 depending on ``fixture``).

    Skips when the SITL replay fixture is not ready, when
    ``E2E_FT_N_05_FIXTURE`` is unset, or when it names a different
    sub-case. Otherwise replays every still image, collects the outbound
    ``source_label`` per frame plus the stale-rejection FDR records, and
    asserts the aged-tile evaluator's verdict.
    """
    if not sitl_replay_ready:
        pytest.skip(
            "FT-N-05 requires `E2E_SITL_REPLAY_DIR` to point at a SITL replay "
            "fixture mounting one of the synth-age tile sets (AZ-595 + AZ-427 "
            "fixture builder). Pure-logic AC-8.2/AC-NEW-6 coverage lives in "
            "e2e/_unit_tests/helpers/test_aged_tile_rejection_evaluator.py."
        )
    active_fixture = os.environ.get(FT_N_05_FIXTURE_ENV)
    if active_fixture is None:
        pytest.skip(
            f"FT-N-05 needs `{FT_N_05_FIXTURE_ENV}` env var to declare which "
            f"synth-age fixture the fixture builder mounted "
            f"(one of {[f for f, _ in ate.AGED_FIXTURE_SECTOR_BINDINGS]}). "
            "The fixture builder publishes this from its per-sub-case orchestration."
        )
    if active_fixture != fixture:
        pytest.skip(
            f"FT-N-05 sub-case [{fixture}/{sector}] does not match the mounted "
            f"fixture (`{FT_N_05_FIXTURE_ENV}` = {active_fixture!r}). "
            "Subsequent matrix sub-cases will run in their own pytest invocations."
        )

    # Imported lazily so collection works while the helpers are still stubs.
    from runner.helpers import fdr_reader, sitl_observer
    from runner.helpers.frame_source_replay import FrameSourceReplayer

    image_paths = sorted(GT_CSV.parent.glob("AD??????.jpg"))
    if len(image_paths) != ae.TOTAL_IMAGES_REQUIRED:
        pytest.fail(
            f"FT-N-05 expects {ae.TOTAL_IMAGES_REQUIRED} still images, "
            f"found {len(image_paths)} under {GT_CSV.parent}"
        )

    sitl_host = "sitl-ardupilot" if fc_adapter == "ardupilot" else "sitl-inav"
    observer = sitl_observer.get_observer(fc_kind=fc_adapter, host=sitl_host)
    sink = _resolve_frame_sink()
    replayer = FrameSourceReplayer(sink)

    emissions: list[ate.SourceLabelEmission] = []
    timed_out_frames: list[str] = []
    per_image_timeout_s = 5.0
    for path in image_paths:
        image_id = path.name
        replayer.replay_image(path)
        try:
            msg = observer.wait_for_outbound(timeout_s=per_image_timeout_s)
        except TimeoutError:
            # A frame with no emission is tolerated, but counted so the
            # evidence shows how much of the run produced no output.
            timed_out_frames.append(image_id)
            continue
        label = getattr(msg, "source_label", None)
        if not isinstance(label, str):
            pytest.fail(
                f"FT-N-05: outbound emission for {image_id} carries no `source_label` "
                f"(got {type(label).__name__}). The SUT must include `source_label` per "
                "FT-P-03 / AC-1.4 even on downgraded frames."
            )
        emissions.append(ate.SourceLabelEmission(frame_id=image_id, label=label))

    fdr_root = Path(evidence_dir).parent / f"run-{run_id}" / "fdr"
    rejections = list(ate.iter_stale_rejection_payloads(fdr_reader.iter_records(fdr_root)))

    sub_case_id = f"{fixture}_{sector}"
    if not emissions and not rejections:
        # Neither signal has any evidence to judge: fail loudly instead of
        # handing the evaluator two empty inputs.
        pytest.fail(
            f"FT-N-05 [{sub_case_id}]: no outbound emissions "
            f"({len(timed_out_frames)} frame(s) timed out after {per_image_timeout_s} s) "
            f"and no stale tile-load rejection records under {fdr_root}; "
            "the fixture is mounted but produced no records to evaluate."
        )

    report = ate.evaluate_aged_tile_rejection(
        sub_case_id=sub_case_id,
        fixture=fixture,
        sector=sector,
        emissions=emissions,
        rejections=rejections,
    )

    metric_prefix = f"ft_n_05.{sub_case_id}"
    nfr_recorder.record_metric(
        f"{metric_prefix}.emissions_observed", float(report.emissions_observed), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        f"{metric_prefix}.anchored_count", float(report.anchored_count), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        f"{metric_prefix}.downgrade_count", float(report.downgrade_frame_count), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        f"{metric_prefix}.stale_rejection_count",
        float(len(report.stale_rejections)),
        ac_id="AC-1",
    )
    nfr_recorder.record_metric(
        f"{metric_prefix}.timeout_count", float(len(timed_out_frames)), ac_id="AC-1"
    )

    assert not report.illegal_labels, (
        "FT-N-05: outbound emissions include labels outside the FT-P-03 contract: "
        f"{report.illegal_labels}. Fix the SUT before assessing the freshness gate."
    )
    assert report.passes, (
        f"FT-N-05 [{sub_case_id}] failed: signal_a={report.signal_a_holds}, "
        f"signal_b={report.signal_b_holds}; "
        f"anchored_frames={list(report.anchored_frame_ids)[:5]}, "
        f"emissions={report.emissions_observed}, "
        f"downgrade_count={report.downgrade_frame_count}, "
        f"stale_rejections={list(report.stale_rejections)[:5]}, "
        f"timed_out_frames={len(timed_out_frames)}"
    )


def _resolve_frame_sink():  # type: ignore[no-untyped-def]
    """Return a replay-mode `FrameSink` (counter-only; AZ-597)."""
    from runner.helpers.replay_mode import NullFrameSink

    return NullFrameSink()
GT_CSV = Path(__file__).resolve().parents[3] / "_docs" / "00_problem" / "input_data" / "coordinates.csv"
STILL_IMAGES_DIR = GT_CSV.parent


@pytest.mark.traces_to("AC-8.6,AC-1,AC-2,AC-3")
def test_ft_p_19_sat_reloc_scale(
    fc_adapter: str,
    vio_strategy: str,
    evidence_dir,  # type: ignore[no-untyped-def]
    run_id: str,
    nfr_recorder,  # type: ignore[no-untyped-def]
    sitl_replay_ready: bool,
) -> None:
    """Full FT-P-19 scenario (AC-8.6).

    AC-1: every image's top-K=10 includes a tile centre within 100 m of GT.
    AC-2: paired _gmaps.png subset → PARTIAL.
    AC-3: parametrised across ``(fc_adapter, vio_strategy)``.

    ``retrieval-topk`` payloads that cannot be used (not a dict, no str
    ``image_id``, no matching GT row, or rejected by the projector) are
    collected and reported in the failure messages and as a metric, so a
    count mismatch can be traced to its cause.
    """
    if not sitl_replay_ready:
        pytest.skip(
            "FT-P-19 requires `E2E_SITL_REPLAY_DIR` to point at a SITL replay "
            "fixture emitting `retrieval-topk` + `scene-change-match` FDR "
            "records (AZ-595 + AZ-423 fixture builder). Pure-logic AC-8.6 "
            "coverage lives in e2e/_unit_tests/helpers/test_retrieval_evaluator.py."
        )

    # Imported lazily so collection works while the helper is still a stub.
    from runner.helpers import fdr_reader

    gt_rows = ae.load_gt_coordinates(GT_CSV)
    gt_by_id = {_normalise_image_id(r.image_id): r for r in gt_rows}

    fdr_root = Path(evidence_dir).parent / f"run-{run_id}" / "fdr"

    topk_queries: list[re_.TopKQuery] = []
    skipped_topk: list[str] = []  # one human-readable reason per dropped payload
    for payload in re_.iter_topk_payloads(fdr_reader.iter_records(fdr_root)):
        if not isinstance(payload, dict):
            skipped_topk.append(f"<payload of type {type(payload).__name__}>")
            continue
        image_id_raw = payload.get("image_id")
        if not isinstance(image_id_raw, str):
            skipped_topk.append("<payload without str image_id>")
            continue
        gt = gt_by_id.get(_normalise_image_id(image_id_raw))
        if gt is None:
            skipped_topk.append(f"{image_id_raw} (no GT row)")
            continue
        q = re_.project_topk_record_to_query(
            payload, true_centre_lat_deg=gt.lat_deg, true_centre_lon_deg=gt.lon_deg
        )
        if q is None:
            skipped_topk.append(f"{image_id_raw} (malformed payload)")
            continue
        topk_queries.append(q)

    scene_change_matches: list[re_.SceneChangeMatch] = []
    for payload in re_.iter_scene_change_payloads(fdr_reader.iter_records(fdr_root)):
        m = re_.project_scene_change_record(payload)
        if m is not None:
            scene_change_matches.append(m)

    if not topk_queries:
        if skipped_topk:
            # Records exist but none is usable: say so rather than
            # claiming that none was found.
            pytest.fail(
                f"FT-P-19: {len(skipped_topk)} `{re_.RETRIEVAL_TOPK_FDR_KIND}` FDR "
                f"record(s) found at {fdr_root} but none was usable: {skipped_topk[:5]}"
            )
        pytest.fail(
            f"FT-P-19: no `{re_.RETRIEVAL_TOPK_FDR_KIND}` FDR records found at "
            f"{fdr_root}. SUT must emit a top-K retrieval record per pushed "
            "image; fixture builder must collect them."
        )

    topk_report = re_.evaluate_top_k_within_distance(
        topk_queries, expected_image_count=ae.TOTAL_IMAGES_REQUIRED
    )
    scene_change_report = re_.evaluate_scene_change_subset(scene_change_matches)

    # Evidence is written before the assertions so a failing run still
    # leaves its CSVs behind.
    top_k_csv = Path(evidence_dir) / f"ft-p-19-topk-{fc_adapter}-{vio_strategy}.csv"
    scene_change_csv = Path(evidence_dir) / f"ft-p-19-scene-change-{fc_adapter}-{vio_strategy}.csv"
    re_.write_top_k_csv(top_k_csv, topk_report)
    re_.write_scene_change_csv(scene_change_csv, scene_change_report)

    nfr_recorder.record_metric(
        "ft_p_19.topk_pass_count", float(topk_report.pass_count), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        "ft_p_19.topk_image_count", float(len(topk_report.entries)), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        "ft_p_19.topk_skipped_payload_count", float(len(skipped_topk)), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        "ft_p_19.scene_change_matched_count",
        float(scene_change_report.matched_count),
        ac_id="AC-2",
    )

    assert topk_report.passes, (
        f"AC-1 (top-K=10 within {topk_report.max_distance_m} m for {topk_report.expected_image_count} images) failed: "
        f"pass_count={topk_report.pass_count}/{topk_report.expected_image_count}, "
        f"entries={len(topk_report.entries)}, "
        f"failing={[(e.image_id, e.min_distance_m) for e in topk_report.failing_entries[:5]]}, "
        f"skipped_payloads={skipped_topk[:5]}"
    )
    assert scene_change_report.coverage_complete, (
        "AC-2 (scene-change subset coverage) failed: paired images observed = "
        f"{[e.image_id for e in scene_change_report.entries]}, expected = "
        f"{list(scene_change_report.expected_image_ids)}"
    )
    assert scene_change_report.overall_label == re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL, (
        f"AC-2 (PARTIAL annotation): got {scene_change_report.overall_label!r}, "
        f"expected {re_.SCENE_CHANGE_SUBSET_PARTIAL_LABEL!r}"
    )


def _normalise_image_id(image_id: str) -> str:
    """Strip ``.jpg`` extension if present; ``coordinates.csv`` uses the bare stem.

    The suffix match is case-insensitive; any other suffix is left as is.
    """
    return image_id[:-4] if image_id.lower().endswith(".jpg") else image_id