Files
gps-denied-onboard/e2e/runner/helpers/mid_flight_tile_evaluator.py
T
Oleksandr Bezdieniezhnykh 5def1a3eb3 [AZ-422] Add FT-P-17 + FT-N-06 mid-flight tile blackbox tests
Implement the AC-8.4 and AC-NEW-6 blackbox scenarios for mid-flight
tile generation, dedup, landing-time upload, and freshness gating.

Helpers:
- runner/helpers/mid_flight_tile_evaluator.py — pure-logic evaluators
  for tile generation rate, Mode B Fact #105 schema check, footprint+
  GSD dedup (via geo.distance_m), upload-audit reconciliation, and
  the AC-5/AC-6 capture_utc + freshness-gate checks.
- runner/helpers/mock_suite_sat_audit.py — httpx wrapper for the
  mock-suite-sat-service /tiles/audit endpoint with strict response-
  shape validation.

Scenarios:
- tests/positive/test_ft_p_17_mid_flight_tiles.py
- tests/negative/test_ft_n_06_mid_flight_freshness.py

Both skip when sitl_replay_ready is false and fail loudly when fixture
records are missing (tests-as-gates discipline). 52 new unit tests
(41 evaluator + 11 audit client) cover every helper branch.

Review: PASS_WITH_WARNINGS (2 Low — duplicate haversine carry-over,
upstream production dependency surface).

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-17 15:28:39 +03:00

501 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Mid-flight tile generation + freshness evaluators (AZ-422 / FT-P-17 + FT-N-06).
Pure-logic evaluators sourced from the FDR archive (per-tile generation
records + freshness-gate events) and the mock-suite-sat-service audit
log (landing-time upload acks).
Sub-scenarios:
* **FT-P-17 / AC-8.4** — four evaluators:
* generation cadence (≥ 1 tile / 3 s of high-quality nav frames);
* quality-metadata sufficiency (per-tile fields the Service voting
layer needs — Mode B Fact #105: capture_utc, source_provider,
resolution_m_per_px, cloud_coverage_pct, geo_accuracy_m, plus
publish-request fields: tile_id, bbox_wgs84, zoom_level,
descriptor_sha256, payload_size_bytes);
* dedup (no two tiles share footprint within ±1 m AND GSD within
±5 %);
* landing-event upload (every generated tile has an audit entry
in the mock-suite-sat-service).
* **FT-N-06 / AC-NEW-6** — two evaluators:
* capture-date freshness (|capture_utc - generated_at| ≤ 60 s);
* freshness-gate (no ``tile-load-rejected: stale`` FDR event for a
freshly generated tile).
All evaluators consume Python dataclasses / dicts. The HTTP fetch
and FDR walk live in scenario tests; this module only decides whether
the parsed inputs satisfy the AC.
Public-boundary discipline: NO imports from ``src/gps_denied_onboard``.
"""
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Iterable, Sequence
from .geo import distance_m
# ─────────────────────── FDR record kinds & schema ───────────────────────
# FDR record kind that carries one generated mid-flight tile.
MID_FLIGHT_TILE_FDR_KIND = "mid-flight-tile-output"

# FDR record kind for a rejected tile load, and the ``reason`` value that
# marks the rejection as a staleness rejection (FT-N-06 / AC-6).
TILE_LOAD_REJECTED_FDR_KIND = "tile-load-rejected"
TILE_LOAD_REJECTED_STALE_REASON = "stale"

# AC-1: at least one tile per ~3 s of high-quality nav frames.
MIN_TILES_PER_HIGH_QUALITY_WINDOW_S = 3.0

# AC-5: maximum allowed |capture_utc - generated_at|, in seconds.
CAPTURE_DATE_FRESHNESS_TOLERANCE_S = 60.0

# AC-3: two tiles are duplicates when their footprints are within this many
# metres AND their GSDs are within this relative fraction (±5 %).
DEDUP_FOOTPRINT_TOLERANCE_M = 1.0
DEDUP_GSD_TOLERANCE_FRACTION = 0.05

# Schema mirror — must stay in sync with ``e2e/fixtures/mock-suite-sat/app.py``
# ``TilePublishRequest`` + ``TileQualityMetadata``.
TILE_REQUIRED_TOP_LEVEL_FIELDS: tuple[str, ...] = (
    "tile_id",
    "bbox_wgs84",
    "zoom_level",
    "descriptor_sha256",
    "payload_size_bytes",
    "quality",
)

# Keys that must be present (and non-None) inside a tile's ``quality`` dict.
TILE_REQUIRED_QUALITY_FIELDS: tuple[str, ...] = (
    "capture_utc",
    "source_provider",
    "resolution_m_per_px",
    "cloud_coverage_pct",
    "geo_accuracy_m",
)
@dataclass(frozen=True)
class TileSpec:
    """Immutable view of a single ``mid-flight-tile-output`` FDR record.

    The field set follows the TilePublishRequest schema, so one instance
    can drive both the landing-event upload comparison (AC-4) and the
    per-tile evaluators.

    ``bbox_wgs84`` is ordered ``(west_lon, south_lat, east_lon, north_lat)``,
    the same order the mock-suite-sat-service contract uses.

    ``generated_at_monotonic_ms`` is the SUT's emission timestamp taken
    from the FDR envelope's ``ts`` (projected to monotonic ms by the FDR
    reader). ``capture_utc_iso`` is the per-tile capture time; the two
    are expected to agree within ``CAPTURE_DATE_FRESHNESS_TOLERANCE_S``
    (FT-N-06).
    """

    # Publish-request fields.
    tile_id: str
    bbox_wgs84: tuple[float, float, float, float]
    zoom_level: int
    descriptor_sha256: str
    payload_size_bytes: int
    # Quality metadata dict (capture_utc, resolution_m_per_px, ...).
    quality: dict[str, object]
    # Emission timestamp from the FDR envelope.
    generated_at_monotonic_ms: int
    # Convenience accessor; same as quality["capture_utc"]. Optional.
    capture_utc_iso: str | None = None
def bbox_centre(bbox: tuple[float, float, float, float]) -> tuple[float, float]:
    """Compute the midpoint of a WGS84 bbox given as ``(west, south, east, north)``.

    Returns:
        ``(lat, lon)`` — note the order is swapped relative to the bbox,
        which lists longitude first.
    """
    west, south, east, north = bbox
    mid_lat = (south + north) / 2.0
    mid_lon = (west + east) / 2.0
    return (mid_lat, mid_lon)
# ─────────────────────────── FT-P-17 / AC-1 ───────────────────────────
@dataclass(frozen=True)
class TileGenerationRateReport:
    """Outcome of FT-P-17 / AC-1 (≥ 1 tile per ~3 s of high-quality nav frames)."""

    # Number of tiles observed in the replay.
    tile_count: int
    # Seconds of high-quality nav frames the tiles were generated over.
    high_quality_window_s: float
    # Tiles per reference window, as computed by the evaluator.
    observed_rate_per_3s: float
    # Threshold the observed rate must reach.
    min_required_rate_per_3s: float = 1.0

    @property
    def passes(self) -> bool:
        """True when the window is usable and the observed rate meets the minimum."""
        # A zero or negative window carries no evidence, so it can never pass.
        window_usable = not (self.high_quality_window_s <= 0)
        rate_met = self.observed_rate_per_3s >= self.min_required_rate_per_3s
        return window_usable and rate_met
def evaluate_tile_generation_rate(
    tiles: Sequence[TileSpec],
    high_quality_window_s: float,
    *,
    window_s_per_tile: float = MIN_TILES_PER_HIGH_QUALITY_WINDOW_S,
) -> TileGenerationRateReport:
    """AC-1: tile generation rate across the high-quality nav-frame window.

    Args:
        tiles: Tiles generated during the replay; only their count is used.
        high_quality_window_s: Total wall-clock seconds of the replay that
            produced "high-quality" nav frames (AC-2.1a normal-segment in
            `_docs/02_document/tests/blackbox-tests.md`). The scenario test
            derives it from the FDR's segment-quality records; this helper
            only divides.
        window_s_per_tile: Reference window length; the AC requires at
            least one tile per this many seconds.

    Returns:
        A report whose ``observed_rate_per_3s`` is
        ``tile_count / high_quality_window_s * window_s_per_tile``, i.e.
        tiles per reference window. A non-positive
        ``high_quality_window_s`` yields a rate of ``0.0``.

    Raises:
        ValueError: If ``window_s_per_tile`` is not strictly positive.
    """
    if window_s_per_tile <= 0:
        raise ValueError(f"window_s_per_tile must be > 0, got {window_s_per_tile}")

    produced = len(tiles)
    if high_quality_window_s <= 0:
        # Nothing to divide by; report a zero rate instead of raising.
        normalised_rate = 0.0
    else:
        normalised_rate = (produced / high_quality_window_s) * window_s_per_tile

    return TileGenerationRateReport(
        tile_count=produced,
        high_quality_window_s=high_quality_window_s,
        observed_rate_per_3s=normalised_rate,
    )
# ─────────────────────────── FT-P-17 / AC-2 ───────────────────────────
@dataclass(frozen=True)
class TileQualityEntryReport:
    """Schema-completeness verdict for one tile."""

    tile_id: str
    # Required publish-request fields that were absent (None) on the tile.
    missing_top_level_fields: tuple[str, ...]
    # Required ``quality`` keys that were absent or None.
    missing_quality_fields: tuple[str, ...]

    @property
    def passes(self) -> bool:
        """True only when nothing is missing at either level."""
        anything_missing = bool(
            self.missing_top_level_fields or self.missing_quality_fields
        )
        return not anything_missing
@dataclass(frozen=True)
class TileQualityReport:
    """Outcome of FT-P-17 / AC-2: each tile carries the Mode B Fact #105 fields."""

    # One entry per evaluated tile, in input order.
    entries: tuple[TileQualityEntryReport, ...]

    @property
    def failing_entries(self) -> tuple[TileQualityEntryReport, ...]:
        """Entries whose own ``passes`` is false, in original order."""
        rejected = [entry for entry in self.entries if not entry.passes]
        return tuple(rejected)

    @property
    def passes(self) -> bool:
        """True when there is at least one entry and none of them fail.

        An empty report fails, so a run that produced no tiles cannot pass.
        """
        return bool(self.entries) and not self.failing_entries
def evaluate_tile_quality_metadata(
    tiles: Sequence[TileSpec],
    *,
    required_top_level: Sequence[str] = TILE_REQUIRED_TOP_LEVEL_FIELDS,
    required_quality: Sequence[str] = TILE_REQUIRED_QUALITY_FIELDS,
) -> TileQualityReport:
    """AC-2: check that each tile has every required field populated.

    A top-level field counts as populated when the matching ``TileSpec``
    attribute exists and is not ``None``. A quality field counts as
    populated when its key is in ``tile.quality`` with a non-``None``
    value. The ``"quality"`` top-level name is not checked as an
    attribute; it is covered by the per-key quality check instead.

    Args:
        tiles: Tiles to inspect.
        required_top_level: Publish-request field names to require.
        required_quality: Keys to require inside ``tile.quality``.

    Returns:
        A report with one entry per tile, in input order. A tile whose
        ``quality`` is not a dict is reported as missing every required
        quality field. A falsy ``tile_id`` is reported as ``"<unknown>"``.
    """
    per_tile: list[TileQualityEntryReport] = []
    for tile in tiles:
        absent_top = tuple(
            name
            for name in required_top_level
            if name != "quality"
            and getattr(tile, _top_level_field_to_attr(name), None) is None
        )

        quality = tile.quality
        if isinstance(quality, dict):
            # ``.get`` returns None for both an absent key and an explicit None.
            absent_quality = tuple(
                key for key in required_quality if quality.get(key) is None
            )
        else:
            absent_quality = tuple(required_quality)

        per_tile.append(
            TileQualityEntryReport(
                tile_id=tile.tile_id or "<unknown>",
                missing_top_level_fields=absent_top,
                missing_quality_fields=absent_quality,
            )
        )
    return TileQualityReport(entries=tuple(per_tile))
def _top_level_field_to_attr(field: str) -> str:
    """Translate a TilePublishRequest field name into its TileSpec attribute name.

    The two naming schemes are currently identical, so this is the
    identity function; it exists as the single place to handle any
    future drift between them.
    """
    attr_name = field
    return attr_name
# ─────────────────────────── FT-P-17 / AC-3 ───────────────────────────
@dataclass(frozen=True)
class TileDedupReport:
    """Outcome of FT-P-17 / AC-3: no two tiles fall into the same (footprint, GSD) bin."""

    # ``(tile_id, tile_id)`` for every pair judged to be duplicates.
    duplicate_pairs: tuple[tuple[str, str], ...]
    # Tolerances the evaluation used, recorded for reporting.
    footprint_tolerance_m: float = DEDUP_FOOTPRINT_TOLERANCE_M
    gsd_tolerance_fraction: float = DEDUP_GSD_TOLERANCE_FRACTION

    @property
    def duplicate_count(self) -> int:
        """Number of duplicate pairs found."""
        return len(self.duplicate_pairs)

    @property
    def passes(self) -> bool:
        """True when no duplicate pair was found."""
        return not self.duplicate_count
def evaluate_dedup(
    tiles: Sequence[TileSpec],
    *,
    footprint_tolerance_m: float = DEDUP_FOOTPRINT_TOLERANCE_M,
    gsd_tolerance_fraction: float = DEDUP_GSD_TOLERANCE_FRACTION,
) -> TileDedupReport:
    """AC-3: find pairs of tiles that duplicate each other.

    Tiles ``a`` and ``b`` form a duplicate pair when both conditions hold:

    * ``|gsd_a - gsd_b| / max(gsd_a, gsd_b) <= gsd_tolerance_fraction``, and
    * the distance between their bbox centres, as reported by
      ``distance_m``, is ``<= footprint_tolerance_m``.

    Tiles for which no GSD can be extracted take no part in any
    comparison, and a pair whose larger GSD is exactly zero is skipped
    to avoid dividing by zero.

    Every pair is compared, so the cost is O(N²) in the number of tiles.

    Args:
        tiles: Tiles to compare.
        footprint_tolerance_m: Maximum centre-to-centre distance, in metres.
        gsd_tolerance_fraction: Maximum relative GSD difference.

    Returns:
        A report listing the offending ``(tile_id, tile_id)`` pairs in
        input order, plus the tolerances used.

    Raises:
        ValueError: If either tolerance is negative.
    """
    if footprint_tolerance_m < 0:
        raise ValueError(f"footprint_tolerance_m must be ≥0, got {footprint_tolerance_m}")
    if gsd_tolerance_fraction < 0:
        raise ValueError(
            f"gsd_tolerance_fraction must be ≥0, got {gsd_tolerance_fraction}"
        )

    centres: list[tuple[float, float]] = [bbox_centre(t.bbox_wgs84) for t in tiles]
    gsds: list[float | None] = [_extract_gsd(t) for t in tiles]

    # Indices of tiles that have a usable GSD; the rest are never compared.
    comparable = [idx for idx, gsd in enumerate(gsds) if gsd is not None]

    found: list[tuple[str, str]] = []
    for position, first in enumerate(comparable):
        for second in comparable[position + 1:]:
            larger_gsd = max(gsds[first], gsds[second])
            if larger_gsd == 0:
                continue
            relative_gap = abs(gsds[first] - gsds[second]) / larger_gsd
            if relative_gap > gsd_tolerance_fraction:
                continue
            # The cheap GSD test passed; now do the geodesic distance check.
            lat_a, lon_a = centres[first]
            lat_b, lon_b = centres[second]
            if distance_m(lat_a, lon_a, lat_b, lon_b) <= footprint_tolerance_m:
                found.append((tiles[first].tile_id, tiles[second].tile_id))

    return TileDedupReport(
        duplicate_pairs=tuple(found),
        footprint_tolerance_m=footprint_tolerance_m,
        gsd_tolerance_fraction=gsd_tolerance_fraction,
    )
def _extract_gsd(tile: TileSpec) -> float | None:
    """Pull GSD (``resolution_m_per_px``) from the tile's quality dict.

    Returns:
        The GSD as a ``float``, or ``None`` when the tile has no usable
        value: ``quality`` is not a dict, the key is absent, or the value
        is not a real finite number.

    ``bool`` is rejected explicitly because it is a subclass of ``int``
    (``True`` would otherwise read as a GSD of 1.0). NaN and ±inf are
    rejected because every comparison against NaN is false, which would
    let a tile slip past a GSD tolerance check unnoticed.
    """
    import math  # local import: the module does not import math at file level

    if not isinstance(tile.quality, dict):
        return None
    raw = tile.quality.get("resolution_m_per_px")
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        return None
    try:
        gsd = float(raw)
    except OverflowError:
        # An int too large for a float is not a meaningful GSD.
        return None
    return gsd if math.isfinite(gsd) else None
# ─────────────────────────── FT-P-17 / AC-4 ───────────────────────────
@dataclass(frozen=True)
class TileUploadAckReport:
    """Outcome of FT-P-17 / AC-4: every generated tile uploaded with HTTP 202."""

    # IDs of the tiles the SUT generated, in input order.
    generated_tile_ids: tuple[str, ...]
    # IDs found in the upload audit log.
    audit_tile_ids: tuple[str, ...]
    # Generated IDs that have no audit entry.
    missing_from_audit: tuple[str, ...]

    @property
    def passes(self) -> bool:
        """True when at least one tile was generated and none lacks an ack.

        A run that generated no tiles fails rather than passing vacuously.
        """
        return bool(self.generated_tile_ids) and not self.missing_from_audit
def evaluate_upload_acks(
    generated_tiles: Sequence[TileSpec],
    audit_entries: Sequence[dict],
) -> TileUploadAckReport:
    """AC-4: reconcile generated tile IDs against the mock-suite-sat-service audit.

    The mock-suite-sat-service ``POST /tiles`` endpoint writes only its
    HTTP 202 responses to the run-scoped audit log. A tile rejected with
    400 or a forced 5xx never reaches the audit, so any generated
    ``tile_id`` with no audit entry is treated as a missing ack.

    Args:
        generated_tiles: Tiles the SUT generated.
        audit_entries: The ``entries`` field of the JSON response from
            ``GET /tiles/audit?run_id=<RUN_ID>``. Items that are not dicts
            or have no ``"tile_id"`` key are ignored.

    Returns:
        A report with the generated IDs, the audited IDs, and the
        generated IDs that are missing from the audit (input order kept).
    """
    generated_ids = tuple(tile.tile_id for tile in generated_tiles)

    acked: list = []
    for entry in audit_entries:
        if isinstance(entry, dict) and "tile_id" in entry:
            acked.append(entry["tile_id"])
    audit_ids = tuple(acked)

    # Set lookup keeps the reconciliation linear in the number of tiles.
    acked_lookup = set(audit_ids)
    unacked = tuple(
        tile_id for tile_id in generated_ids if tile_id not in acked_lookup
    )

    return TileUploadAckReport(
        generated_tile_ids=generated_ids,
        audit_tile_ids=audit_ids,
        missing_from_audit=unacked,
    )
# ─────────────────────────── FT-N-06 / AC-5 ───────────────────────────
@dataclass(frozen=True)
class CaptureDateFreshnessEntryReport:
    """Drift between ``capture_utc`` and ``generated_at`` for one tile.

    This entry only records the drift. The pass/fail decision is made by
    ``CaptureDateFreshnessReport``, because the tolerance is a single
    report-wide setting (AC-5 fixes it at 60 s globally).
    """

    tile_id: str
    # Signed drift in seconds; None when capture_utc cannot be parsed.
    drift_s: float | None
@dataclass(frozen=True)
class CaptureDateFreshnessReport:
    """Outcome of FT-N-06 / AC-5: |capture_utc - generated_at_wall_clock| ≤ 60 s."""

    # One entry per evaluated tile, in input order.
    entries: tuple[CaptureDateFreshnessEntryReport, ...]
    # Maximum allowed absolute drift, in seconds.
    tolerance_s: float = CAPTURE_DATE_FRESHNESS_TOLERANCE_S

    @property
    def failing_entries(self) -> tuple[CaptureDateFreshnessEntryReport, ...]:
        """Entries with an unknown drift or a drift beyond the tolerance."""
        rejected: list[CaptureDateFreshnessEntryReport] = []
        for entry in self.entries:
            drift = entry.drift_s
            # An unparseable capture time (drift None) always fails.
            if drift is None or abs(drift) > self.tolerance_s:
                rejected.append(entry)
        return tuple(rejected)

    @property
    def passes(self) -> bool:
        """True when there is at least one entry and none of them fail."""
        return bool(self.entries) and not self.failing_entries
def evaluate_capture_date_freshness(
    tiles: Sequence[TileSpec],
    *,
    tolerance_s: float = CAPTURE_DATE_FRESHNESS_TOLERANCE_S,
) -> CaptureDateFreshnessReport:
    """AC-5: measure each tile's capture_utc drift against its generation time.

    Drift is signed and equals ``capture_utc - generated_at``, in seconds.
    Zero means capture coincided with generation. A negative value means
    capture came BEFORE generation, which is the usual direction: capture
    is instantaneous and generation is the orthorectification step that
    follows.

    The capture time is read from ``tile.capture_utc_iso``. When that is
    ``None``, a string ``quality["capture_utc"]`` is used instead. If no
    capture string is available, or it does not parse as ISO 8601, the
    entry gets ``drift_s = None`` and fails the AC.

    Args:
        tiles: Tiles to evaluate.
        tolerance_s: Maximum allowed absolute drift, stored on the report.

    Returns:
        A report with one entry per tile, in input order.

    Raises:
        ValueError: If ``tolerance_s`` is not strictly positive.
    """
    if tolerance_s <= 0:
        raise ValueError(f"tolerance_s must be > 0, got {tolerance_s}")

    results: list[CaptureDateFreshnessEntryReport] = []
    for tile in tiles:
        iso_text = tile.capture_utc_iso
        if iso_text is None and isinstance(tile.quality, dict):
            fallback = tile.quality.get("capture_utc")
            if isinstance(fallback, str):
                iso_text = fallback

        capture_epoch_s = (
            None if iso_text is None else _parse_iso8601_utc_seconds(iso_text)
        )

        drift_s: float | None
        if capture_epoch_s is None:
            drift_s = None
        else:
            # NOTE(review): this subtracts a "monotonic ms" value from Unix
            # epoch seconds, so it presumably relies on the FDR reader
            # projecting ``ts`` onto an epoch-based scale — confirm.
            drift_s = capture_epoch_s - (tile.generated_at_monotonic_ms / 1000.0)

        results.append(
            CaptureDateFreshnessEntryReport(tile_id=tile.tile_id, drift_s=drift_s)
        )

    return CaptureDateFreshnessReport(
        entries=tuple(results), tolerance_s=tolerance_s
    )
def _parse_iso8601_utc_seconds(ts: str) -> float | None:
    """Parse ISO 8601 ``ts`` into seconds-since-epoch; ``None`` on failure.

    Accepts the trailing ``Z`` shorthand that ``datetime.fromisoformat``
    did not accept until 3.11. A timestamp with no UTC offset is treated
    as UTC.

    Args:
        ts: Timestamp text. A value that is not a ``str`` (for example a
            number or ``None`` carried over from a loosely typed FDR
            record) counts as a parse failure instead of raising.

    Returns:
        POSIX seconds as a float, or ``None`` when ``ts`` cannot be parsed.
    """
    # Without this guard ``ts.endswith`` raises AttributeError on non-str
    # input, which breaks the "None on failure" contract.
    if not isinstance(ts, str):
        return None
    normalised = ts[:-1] + "+00:00" if ts.endswith("Z") else ts
    try:
        dt = datetime.fromisoformat(normalised)
    except ValueError:
        return None
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.timestamp()
# ─────────────────────────── FT-N-06 / AC-6 ───────────────────────────
@dataclass(frozen=True)
class FreshnessGateReport:
    """Outcome of FT-N-06 / AC-6: no `tile-load-rejected: stale` for freshly generated tiles."""

    # IDs of the tiles the SUT generated, in input order.
    generated_tile_ids: tuple[str, ...]
    # IDs of generated tiles that were rejected as stale.
    stale_rejections: tuple[str, ...]

    @property
    def passes(self) -> bool:
        """True when no generated tile was rejected as stale."""
        return len(self.stale_rejections) == 0
def evaluate_freshness_gate(
    generated_tiles: Sequence[TileSpec],
    fdr_rejection_records: Iterable[dict],
    *,
    stale_reason: str = TILE_LOAD_REJECTED_STALE_REASON,
) -> FreshnessGateReport:
    """AC-6: flag stale rejections that hit freshly generated tiles.

    Args:
        generated_tiles: Tiles the SUT generated during the run.
        fdr_rejection_records: Payload dict of each FDR record whose
            ``record_type == TILE_LOAD_REJECTED_FDR_KIND``. Items that are
            not dicts are ignored.
        stale_reason: Value of the payload's ``reason`` key that marks a
            staleness rejection.

    Returns:
        A report listing each rejected tile ID that matches a generated
        tile, once per matching record. Any such match means the
        freshness gate misclassified a fresh tile. The rejected ID is
        read from the payload's ``"id"`` key, falling back to
        ``"tile_id"`` when ``"id"`` is missing or falsy.
    """
    generated_ids = tuple(tile.tile_id for tile in generated_tiles)
    known_ids = set(generated_ids)

    misclassified: list[str] = []
    for record in fdr_rejection_records:
        if not isinstance(record, dict) or record.get("reason") != stale_reason:
            continue
        rejected_id = record.get("id") or record.get("tile_id")
        if isinstance(rejected_id, str) and rejected_id in known_ids:
            misclassified.append(rejected_id)

    return FreshnessGateReport(
        generated_tile_ids=generated_ids, stale_rejections=tuple(misclassified)
    )