[AZ-422] Add FT-P-17 + FT-N-06 mid-flight tile blackbox tests

Implement the AC-8.4 and AC-NEW-6 blackbox scenarios for mid-flight
tile generation, dedup, landing-time upload, and freshness gating.

Helpers:
- runner/helpers/mid_flight_tile_evaluator.py — pure-logic evaluators
  for tile generation rate, Mode B Fact #105 schema check, footprint+
  GSD dedup (via geo.distance_m), upload-audit reconciliation, and
  the AC-5/AC-6 capture_utc + freshness-gate checks.
- runner/helpers/mock_suite_sat_audit.py — httpx wrapper for the
  mock-suite-sat-service /tiles/audit endpoint with strict response-
  shape validation.

Scenarios:
- tests/positive/test_ft_p_17_mid_flight_tiles.py
- tests/negative/test_ft_n_06_mid_flight_freshness.py

Both skip when sitl_replay_ready is false and fail loudly when fixture
records are missing (tests-as-gates discipline). 52 new unit tests
(41 evaluator + 11 audit client) cover every helper branch.

Review: PASS_WITH_WARNINGS (2 Low — duplicate haversine carry-over,
upstream production dependency surface).

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-17 15:28:39 +03:00
parent 1ee54b414b
commit 5def1a3eb3
11 changed files with 1782 additions and 2 deletions
@@ -0,0 +1,126 @@
"""FT-N-06 — Mid-flight tile current-timestamp + fresh-treatment (AZ-422 / AC-NEW-6).
The full scenario:
1. Same 5 min Derkachi replay as FT-P-17; the SUT generates one
FDR ``mid-flight-tile-output`` record per tile.
2. Inspect each tile's manifest entry:
* AC-5: ``|capture_utc - generated_at_monotonic_ms| ≤ 60 s``.
* AC-6: no FDR ``tile-load-rejected`` record with
``reason == "stale"`` carries any of the generated tile IDs
(a fresh tile must not be misclassified by the freshness gate).
Gated on:
* ``sitl_replay_ready`` — full replay requires the SITL fixture.
* ``runner.helpers.mid_flight_tile_evaluator`` — pure-logic
evaluator covered by
``e2e/_unit_tests/helpers/test_mid_flight_tile_evaluator.py``.
This is a "negative" test in the sense that it asserts a *non*-event:
no stale rejection of a freshly generated tile. The test still skips
cleanly when the SITL fixture is not prepared.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from runner.helpers import mid_flight_tile_evaluator as mfe
@pytest.mark.traces_to("AC-NEW-6,AC-5,AC-6,AC-7")
def test_ft_n_06_mid_flight_freshness(
    fc_adapter: str,
    vio_strategy: str,
    evidence_dir,  # type: ignore[no-untyped-def]
    run_id: str,
    nfr_recorder,  # type: ignore[no-untyped-def]
    sitl_replay_ready: bool,
) -> None:
    """Run the FT-N-06 freshness scenario (AC-NEW-6) over the FDR archive.

    The test skips when ``sitl_replay_ready`` is false and fails when the
    archive yields no projectable generated-tile record. Otherwise it runs
    the capture-date (AC-5) and freshness-gate (AC-6) evaluators over the
    projected tiles, records three count metrics, and requires both reports
    to pass.

    ``fc_adapter`` and ``vio_strategy`` are not read by the body; presumably
    they only carry the AC-7 parameterisation — confirm against the fixtures.
    """
    if not sitl_replay_ready:
        skip_reason = (
            "FT-N-06 requires `E2E_SITL_REPLAY_DIR` to point at a SITL replay "
            "fixture exposing `mid-flight-tile-output` FDR records and any "
            "`tile-load-rejected` events emitted by the freshness gate "
            "(AZ-595 + AZ-422 fixture builder). Pure-logic AC-NEW-6 "
            "coverage lives in "
            "e2e/_unit_tests/helpers/test_mid_flight_tile_evaluator.py."
        )
        pytest.skip(skip_reason)

    # Imported after the skip gate, so a skipped run never loads the reader.
    from runner.helpers import fdr_reader

    archive_dir = Path(evidence_dir).parent / f"run-{run_id}" / "fdr"
    generated: list[mfe.TileSpec] = []
    rejections: list[dict] = []
    for record in fdr_reader.iter_records(archive_dir):
        kind = record.record_type
        if kind == mfe.MID_FLIGHT_TILE_FDR_KIND:
            projected = _project_tile(record)
            # Records that cannot be projected are left out of the tile list.
            if projected is not None:
                generated.append(projected)
            continue
        if kind == mfe.TILE_LOAD_REJECTED_FDR_KIND:
            rejections.append(dict(record.payload))

    if not generated:
        pytest.fail(
            f"FT-N-06: no `{mfe.MID_FLIGHT_TILE_FDR_KIND}` FDR records at "
            f"{archive_dir}. The fixture builder must produce at least one "
            "generated tile for the freshness/stale check to be meaningful."
        )

    capture_report = mfe.evaluate_capture_date_freshness(generated)
    freshness_report = mfe.evaluate_freshness_gate(generated, rejections)

    # (metric name, value, acceptance-criterion id), recorded in this order.
    metrics = (
        ("ft_n_06.tile_count", float(len(generated)), "AC-NEW-6"),
        (
            "ft_n_06.capture_drift_failures",
            float(len(capture_report.failing_entries)),
            "AC-5",
        ),
        (
            "ft_n_06.stale_rejection_count",
            float(len(freshness_report.stale_rejections)),
            "AC-6",
        ),
    )
    for metric_name, metric_value, criterion in metrics:
        nfr_recorder.record_metric(metric_name, metric_value, ac_id=criterion)

    assert capture_report.passes, (
        f"AC-5 (|capture_utc - generated_at| ≤ {capture_report.tolerance_s} s) failed: "
        f"failures={[(e.tile_id, e.drift_s) for e in capture_report.failing_entries]}"
    )
    assert freshness_report.passes, (
        "AC-6 (no `tile-load-rejected: stale` for freshly generated tile) failed: "
        f"stale_rejected_tile_ids={freshness_report.stale_rejections}"
    )
def _project_tile(rec) -> "mfe.TileSpec | None":  # type: ignore[no-untyped-def]
    """Project an FDR record onto a ``TileSpec``; ``None`` if malformed.

    A record is treated as malformed when its payload has no usable
    ``bbox_wgs84`` entry (missing, not iterable, not exactly four items, or
    items that do not convert to ``float``), or when ``zoom_level``,
    ``payload_size_bytes`` or the record's ``monotonic_ms`` do not convert
    to ``int``. Missing or falsy ``tile_id`` / ``descriptor_sha256`` become
    empty strings, missing or falsy ``zoom_level`` / ``payload_size_bytes``
    become ``0``, and a non-dict ``quality`` becomes an empty dict.

    Args:
        rec: FDR record exposing ``payload`` and ``monotonic_ms``.

    Returns:
        The projected ``TileSpec``, or ``None`` for a malformed record.
    """
    payload = rec.payload
    try:
        raw_bbox = tuple(payload["bbox_wgs84"])  # type: ignore[index]
    except (KeyError, TypeError):
        return None
    if len(raw_bbox) != 4:
        return None

    # All numeric conversions happen up front so that a non-numeric field is
    # reported as "malformed" (None) instead of escaping as an exception.
    try:
        bbox = tuple(float(value) for value in raw_bbox)
        zoom_level = int(payload.get("zoom_level") or 0)
        payload_size_bytes = int(payload.get("payload_size_bytes") or 0)
        generated_at_ms = int(rec.monotonic_ms)
    except (TypeError, ValueError, OverflowError):
        return None

    raw_quality = payload.get("quality")
    # Copy the quality dict so the TileSpec does not alias the record payload.
    quality = dict(raw_quality) if isinstance(raw_quality, dict) else {}
    raw_capture = quality.get("capture_utc")
    capture_utc = raw_capture if isinstance(raw_capture, str) else None

    return mfe.TileSpec(
        tile_id=str(payload.get("tile_id") or ""),
        bbox_wgs84=(bbox[0], bbox[1], bbox[2], bbox[3]),
        zoom_level=zoom_level,
        descriptor_sha256=str(payload.get("descriptor_sha256") or ""),
        payload_size_bytes=payload_size_bytes,
        quality=quality,
        generated_at_monotonic_ms=generated_at_ms,
        capture_utc_iso=capture_utc,
    )
@@ -0,0 +1,182 @@
"""FT-P-17 — Mid-flight tile generation + landing-time upload (AZ-422 / AC-8.4).
The full scenario:
1. The SUT cold-starts against an empty ``mid-flight-tile-output/``
FDR directory + the bind-mounted Derkachi fixture.
2. Replay 5 min of Derkachi at the SUT's runtime cadence. While the
SUT generates orthorectified tiles it writes one FDR record per
tile under ``mid-flight-tile-output`` carrying every field the
mock-suite-sat-service ingest schema requires (Mode B Fact #105).
3. After replay, the test simulates a landing event (mechanism is
public-input — ``simulate_landing()`` MAVLink command, owned by
AZ-595 fixture builder); the SUT then uploads every generated
tile to ``mock-suite-sat-service``.
4. The test parses the FDR archive for generated tiles, fetches the
mock-service audit log, and asserts:
* AC-1: ≥ 1 tile per ~3 s of high-quality nav frames.
* AC-2: every tile has all Mode B Fact #105 fields populated.
* AC-3: no two tiles share footprint within ±1 m AND GSD within ±5 %.
* AC-4: every generated tile_id is in the audit log (HTTP 202).
* AC-7: parameterised across ``(fc_adapter, vio_strategy)``.
FT-N-06 (AC-5/AC-6) is a separate file: ``test_ft_n_06_mid_flight_freshness.py``.
Gated on:
* ``sitl_replay_ready`` — full replay requires the SITL fixture.
* ``runner.helpers.mid_flight_tile_evaluator`` — pure-logic evaluator
covered by ``e2e/_unit_tests/helpers/test_mid_flight_tile_evaluator.py``.
* ``runner.helpers.mock_suite_sat_audit.fetch_audit`` — HTTP wrapper
covered by ``e2e/_unit_tests/helpers/test_mock_suite_sat_audit.py``.
* ``FT_P_17_HIGH_QUALITY_WINDOW_S`` env var (its name is held in the
``FT_P_17_HIGH_QUALITY_WINDOW_S_ENV`` constant) — the fixture builder
records the total wall-clock seconds of high-quality nav frames
produced by the replay (per AC-2.1a normal-segment criterion). Without
this env var the scenario can't compute the AC-1 denominator and skips.
"""
from __future__ import annotations
import os
from pathlib import Path
import pytest
from runner.helpers import mid_flight_tile_evaluator as mfe
from runner.helpers import mock_suite_sat_audit
FT_P_17_HIGH_QUALITY_WINDOW_S_ENV = "FT_P_17_HIGH_QUALITY_WINDOW_S"
@pytest.mark.traces_to("AC-8.4,AC-1,AC-2,AC-3,AC-4,AC-7")
def test_ft_p_17_mid_flight_tiles(
    fc_adapter: str,
    vio_strategy: str,
    evidence_dir,  # type: ignore[no-untyped-def]
    run_id: str,
    nfr_recorder,  # type: ignore[no-untyped-def]
    sitl_replay_ready: bool,
    mock_suite_sat_url: str,
) -> None:
    """Full FT-P-17 scenario (AC-8.4).

    Skips when ``sitl_replay_ready`` is false or when the env var named by
    ``FT_P_17_HIGH_QUALITY_WINDOW_S_ENV`` is unset or empty. Fails when that
    value does not parse as a float, parses to a non-finite or negative
    number, or when the FDR archive yields no generated-tile record.

    Otherwise the test fetches the audit entries for ``run_id`` from
    ``mock_suite_sat_url``, runs the rate (AC-1), quality-metadata (AC-2),
    dedup (AC-3) and upload-ack (AC-4) evaluators, records six metrics, and
    requires all four reports to pass.

    ``fc_adapter`` and ``vio_strategy`` are not read by the body; presumably
    they only carry the AC-7 parameterisation — confirm against the fixtures.
    """
    if not sitl_replay_ready:
        pytest.skip(
            "FT-P-17 requires `E2E_SITL_REPLAY_DIR` to point at a SITL replay "
            "fixture exposing `mid-flight-tile-output` FDR records and the "
            "post-landing audit population on mock-suite-sat-service "
            "(AZ-595 + AZ-422 fixture builder). Pure-logic AC-8.4 coverage "
            "lives in e2e/_unit_tests/helpers/test_mid_flight_tile_evaluator.py."
        )
    high_quality_window_s_str = os.environ.get(FT_P_17_HIGH_QUALITY_WINDOW_S_ENV)
    if not high_quality_window_s_str:
        pytest.skip(
            f"FT-P-17 needs `{FT_P_17_HIGH_QUALITY_WINDOW_S_ENV}` env var "
            "(total wall-clock seconds of high-quality nav frames per "
            "AC-2.1a). The fixture builder records this from the replay's "
            "segment-quality FDR records."
        )
    try:
        high_quality_window_s = float(high_quality_window_s_str)
    except ValueError as exc:
        pytest.fail(
            f"FT-P-17: `{FT_P_17_HIGH_QUALITY_WINDOW_S_ENV}` must parse as "
            f"float; got {high_quality_window_s_str!r}: {exc}"
        )

    # float() also accepts "nan", "inf" and negative numbers; none of them is
    # a meaningful duration, so reject them here with a clear message rather
    # than letting them skew the AC-1 rate. Zero is passed through unchanged
    # and left to the evaluator.
    import math

    if not math.isfinite(high_quality_window_s) or high_quality_window_s < 0:
        pytest.fail(
            f"FT-P-17: `{FT_P_17_HIGH_QUALITY_WINDOW_S_ENV}` must be a finite, "
            f"non-negative number of seconds; got {high_quality_window_s_str!r}"
        )

    # Imported after the skip gates, so a skipped run never loads the reader.
    from runner.helpers import fdr_reader

    fdr_root = Path(evidence_dir).parent / f"run-{run_id}" / "fdr"
    tiles = list(_extract_tiles_from_fdr(fdr_reader, fdr_root))
    if not tiles:
        pytest.fail(
            f"FT-P-17: no `{mfe.MID_FLIGHT_TILE_FDR_KIND}` FDR records under "
            f"{fdr_root}. The SUT must generate at least one tile per AC-1."
        )
    audit_entries = mock_suite_sat_audit.fetch_audit(mock_suite_sat_url, run_id=run_id)

    rate_report = mfe.evaluate_tile_generation_rate(tiles, high_quality_window_s)
    quality_report = mfe.evaluate_tile_quality_metadata(tiles)
    dedup_report = mfe.evaluate_dedup(tiles)
    upload_report = mfe.evaluate_upload_acks(tiles, audit_entries)

    # Metrics are recorded before the assertions so they are captured even
    # when one of the acceptance criteria fails.
    nfr_recorder.record_metric(
        "ft_p_17.tile_count", float(rate_report.tile_count), ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        "ft_p_17.observed_rate_per_3s", rate_report.observed_rate_per_3s, ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        "ft_p_17.high_quality_window_s", high_quality_window_s, ac_id="AC-1"
    )
    nfr_recorder.record_metric(
        "ft_p_17.tile_quality_failures",
        float(len(quality_report.failing_entries)),
        ac_id="AC-2",
    )
    nfr_recorder.record_metric(
        "ft_p_17.dedup_duplicate_pairs",
        float(dedup_report.duplicate_count),
        ac_id="AC-3",
    )
    nfr_recorder.record_metric(
        "ft_p_17.audit_missing_count",
        float(len(upload_report.missing_from_audit)),
        ac_id="AC-4",
    )

    assert rate_report.passes, (
        f"AC-1 (≥1 tile per {mfe.MIN_TILES_PER_HIGH_QUALITY_WINDOW_S} s) failed: "
        f"{rate_report.tile_count} tiles over {high_quality_window_s} s "
        f"high-quality window → rate={rate_report.observed_rate_per_3s:.3f}/3s"
    )
    assert quality_report.passes, (
        "AC-2 (every tile has Mode B Fact #105 quality fields) failed: "
        f"failures={[(e.tile_id, e.missing_top_level_fields, e.missing_quality_fields) for e in quality_report.failing_entries]}"
    )
    assert dedup_report.passes, (
        "AC-3 (no duplicate footprint+GSD bins) failed: "
        f"duplicate_pairs={dedup_report.duplicate_pairs}"
    )
    assert upload_report.passes, (
        "AC-4 (landing-event upload accepted) failed: "
        f"generated={len(upload_report.generated_tile_ids)}, "
        f"audited={len(upload_report.audit_tile_ids)}, "
        f"missing={upload_report.missing_from_audit}"
    )
def _extract_tiles_from_fdr(fdr_reader, fdr_root: Path):  # type: ignore[no-untyped-def]
    """Yield ``TileSpec``s from every ``mid-flight-tile-output`` FDR record.

    Each record's payload mirrors the mock-suite-sat-service TilePublishRequest
    shape; the scenario only projects it onto a ``TileSpec`` and lets the
    evaluators do the AC math.

    Records of other kinds are ignored. A tile record is skipped when its
    ``bbox_wgs84`` is missing, not iterable, not exactly four items, or holds
    items that do not convert to ``float``, and likewise when ``zoom_level``,
    ``payload_size_bytes`` or the record's ``monotonic_ms`` do not convert to
    ``int``. This way one malformed record does not abort the whole
    extraction.

    Args:
        fdr_reader: Object exposing ``iter_records(fdr_root)``.
        fdr_root: Location handed to ``iter_records``.

    Yields:
        One ``TileSpec`` per well-formed tile record, in archive order.
    """
    for rec in fdr_reader.iter_records(fdr_root):
        if rec.record_type != mfe.MID_FLIGHT_TILE_FDR_KIND:
            continue
        payload = rec.payload
        try:
            raw_bbox = tuple(payload["bbox_wgs84"])  # type: ignore[index]
        except (KeyError, TypeError):
            continue
        if len(raw_bbox) != 4:
            continue

        # Convert every numeric field up front; a non-numeric value marks the
        # record as malformed and it is skipped like a missing bbox.
        try:
            bbox = tuple(float(value) for value in raw_bbox)
            zoom_level = int(payload.get("zoom_level") or 0)
            payload_size_bytes = int(payload.get("payload_size_bytes") or 0)
            generated_at_ms = int(rec.monotonic_ms)
        except (TypeError, ValueError, OverflowError):
            continue

        raw_quality = payload.get("quality")
        # Copy the quality dict so the TileSpec does not alias the payload.
        quality = dict(raw_quality) if isinstance(raw_quality, dict) else {}
        raw_capture = quality.get("capture_utc")
        capture_utc = raw_capture if isinstance(raw_capture, str) else None

        yield mfe.TileSpec(
            tile_id=str(payload.get("tile_id") or ""),
            bbox_wgs84=(bbox[0], bbox[1], bbox[2], bbox[3]),
            zoom_level=zoom_level,
            descriptor_sha256=str(payload.get("descriptor_sha256") or ""),
            payload_size_bytes=payload_size_bytes,
            quality=quality,
            generated_at_monotonic_ms=generated_at_ms,
            capture_utc_iso=capture_utc,
        )