Files
gps-denied-onboard/e2e/runner/helpers/accuracy_evaluator.py
T
Oleksandr Bezdieniezhnykh 29ac16cfcb [AZ-409] [AZ-412] [AZ-413] Batch 70: FT-P-01/04/05/06 scenarios
AZ-409 (3pt) — FT-P-01 still-image frame-center accuracy:
- accuracy_evaluator.py: GT loader + Vincenty error + AC-2/AC-3 pass-counts
- test_ft_p_01_still_image_accuracy.py: scenario gated on frame_source_replay
  + sitl_observer NotImplementedError; AC-4 timeout discipline

AZ-412 (3pt) — FT-P-04 Derkachi f2f registration >=95% on normal segments:
- registration_classifier.py: accel-derived attitude + overlap heuristic
  + success ratio with AC-3 sharp-turn exclusion
- test_ft_p_04_derkachi_f2f_registration.py: scenario gated on
  frame_source_replay + imu_replay + fdr_reader

AZ-413 (3pt) — FT-P-05 + FT-P-06 cross-domain MRE budgets:
- mre_evaluator.py: per-image budget (strict <2.5px) + 95th-percentile
  via numpy linear interp + combined report
- test_ft_p_05_sat_anchor.py: cross-domain scenario, reuses
  accuracy_evaluator for geodesic join
- test_ft_p_06_mre_budgets.py: pure piggyback on FT-P-04 + FT-P-05 CSV
  evidence; skips when either upstream CSV missing

Tests: 325 unit tests pass (+77 vs batch 69).
Reports: batch_70_report.md, batch_70_review.md (PASS).
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-16 18:10:46 +03:00

257 lines
8.2 KiB
Python

"""Per-image accuracy evaluation for FT-P-01 (AZ-409 — AC-1.1, AC-1.2).
Consumes a list of ``(image_id, est_lat, est_lon)`` estimates produced by
the SUT during a 60-image still-image push, joins against the ground-truth
``coordinates.csv`` shipped with the project, computes Vincenty geodesic
distance per image, and reports the AC-2 / AC-3 pass-counts.
The helper is **transport-agnostic**: the scenario test reads the per-image
estimates from the SITL observer (or post-run FDR archive) and hands a
typed list to ``evaluate()`` — no SUT import.
The pass-count thresholds come from the spec's
``expected_results/results_report.md`` Pass/Fail Rules:
* AC-2 (50 m budget): ≥48 / 60 images pass (80 %).
* AC-3 (20 m budget): ≥30 / 60 images pass (50 %).
Timeout discipline (AC-4): when the SITL listener times out for an image,
the scenario passes ``est_lat = est_lon = float('inf')``; ``evaluate()``
records ``error_m = inf``, ``pass_50m = False``, ``pass_20m = False`` for
that image. The aggregate may still pass if other images carry the count.
Public-boundary discipline: this module does NOT import any
``src/gps_denied_onboard`` symbol.
"""
from __future__ import annotations
import csv
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Sequence
from .geo import distance_m
# AC-2: minimum number of images whose error must be within the 50 m budget.
PASS_COUNT_50M_REQUIRED = 48
# AC-3: minimum number of images whose error must be within the 20 m budget.
PASS_COUNT_20M_REQUIRED = 30
# Size of the still-image push that the two pass-counts above are quoted against.
# NOTE(review): not referenced by any function visible in this module —
# presumably asserted by the scenario test; confirm before removing.
TOTAL_IMAGES_REQUIRED = 60
@dataclass(frozen=True)
class GtCoordinate:
    """Immutable ground-truth record: the WGS84 frame centre of one still image."""

    # Key used to join this row against the estimates.
    image_id: str
    # Latitude of the frame centre, in degrees.
    lat_deg: float
    # Longitude of the frame centre, in degrees.
    lon_deg: float
@dataclass(frozen=True)
class EstimateInput:
    """Position estimate for one image, as captured at the SITL listener.

    Timeout convention: when no message arrived within the scenario's 5 s
    budget, the scenario sets both ``est_lat_deg`` and ``est_lon_deg`` to
    ``float('inf')``. ``evaluate()`` then reports ``error_m = inf`` with
    both pass flags False for that image.
    """

    # Key used to join this estimate against the ground-truth row.
    image_id: str
    # Estimated latitude in degrees (``inf`` marks a timeout).
    est_lat_deg: float
    # Estimated longitude in degrees (``inf`` marks a timeout).
    est_lon_deg: float
@dataclass(frozen=True)
class PerImageResult:
    """One evaluated image; each instance becomes a row of ``ft-p-01.csv``."""

    image_id: str
    # Ground-truth frame centre, degrees.
    gt_lat: float
    gt_lon: float
    # Estimate under evaluation, degrees (``inf`` when the image timed out).
    est_lat: float
    est_lon: float
    # Distance between ground truth and estimate, metres (``inf`` on timeout).
    error_m: float
    # Whether ``error_m`` is within the 50 m (AC-2) / 20 m (AC-3) budget.
    pass_50m: bool
    pass_20m: bool
@dataclass(frozen=True)
class AggregateReport:
    """Run-level pass counts; the scenario asserts on the properties below.

    The two ``*_required`` thresholds default to the module-level AC-2 / AC-3
    constants and can be overridden per instance.
    """

    # Number of per-image results the counts were taken over.
    total_images: int
    # Images whose error was within the 50 m budget.
    pass_count_50m: int
    # Images whose error was within the 20 m budget.
    pass_count_20m: int
    # Images recorded as timeouts.
    timeout_count: int
    pass_50m_required: int = PASS_COUNT_50M_REQUIRED
    pass_20m_required: int = PASS_COUNT_20M_REQUIRED

    @property
    def pass_ac2(self) -> bool:
        """AC-2 verdict: the 50 m pass count reaches ``pass_50m_required``."""
        return self.pass_50m_required <= self.pass_count_50m

    @property
    def pass_ac3(self) -> bool:
        """AC-3 verdict: the 20 m pass count reaches ``pass_20m_required``."""
        return self.pass_20m_required <= self.pass_count_20m

    @property
    def overall_pass(self) -> bool:
        """Overall verdict: True only when AC-2 and AC-3 both hold."""
        if not self.pass_ac2:
            return False
        return self.pass_ac3
def load_gt_coordinates(csv_path: Path) -> list[GtCoordinate]:
    """Parse the project's ``coordinates.csv`` into ground-truth rows.

    The file must start with the header ``image, lat, lon``; whitespace
    around each cell (header and data) is stripped before use. Lines that
    are empty or contain only whitespace/commas are skipped.

    Args:
        csv_path: Location of ``coordinates.csv``.

    Returns:
        One ``GtCoordinate`` per data row, in file order.

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        ValueError: the file is empty, the header does not match, a data
            row does not have exactly three columns, or lat/lon is not a
            number. Row-level messages include the 1-based line number.
    """
    if not csv_path.exists():
        raise FileNotFoundError(
            f"coordinates.csv not found at {csv_path} — check the bind-mount or repo path"
        )
    expected = ["image", "lat", "lon"]
    rows: list[GtCoordinate] = []
    # newline="" is what the csv module requires of its input file;
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise glue itself to the first header cell and fail the check below.
    with csv_path.open(encoding="utf-8-sig", newline="") as fh:
        reader = csv.reader(fh)
        header = next(reader, None)
        if header is None:
            # An empty file used to escape as a bare StopIteration.
            raise ValueError(
                f"coordinates.csv at {csv_path} is empty: expected header {expected}"
            )
        normalised_header = [c.strip() for c in header]
        if normalised_header != expected:
            raise ValueError(
                f"coordinates.csv header mismatch: expected {expected}, got {normalised_header}"
            )
        for raw in reader:
            cells = [c.strip() for c in raw]
            if not any(cells):
                # Blank or whitespace-only line (e.g. trailing newline padding).
                continue
            if len(cells) != len(expected):
                raise ValueError(
                    f"coordinates.csv line {reader.line_num}: expected "
                    f"{len(expected)} columns, got {len(cells)}: {raw}"
                )
            image_id, lat_str, lon_str = cells
            try:
                lat_deg = float(lat_str)
                lon_deg = float(lon_str)
            except ValueError as exc:
                raise ValueError(
                    f"coordinates.csv line {reader.line_num}: "
                    f"non-numeric lat/lon in {raw}"
                ) from exc
            rows.append(
                GtCoordinate(image_id=image_id, lat_deg=lat_deg, lon_deg=lon_deg)
            )
    return rows
def _is_timeout(value: float) -> bool:
"""An est_lat or est_lon of inf marks an AC-4 timeout."""
return math.isinf(value)
def compute_per_image(
    gt: GtCoordinate, estimate: EstimateInput
) -> PerImageResult:
    """Score one estimate against its ground-truth row.

    Args:
        gt: Ground-truth coordinate for the image.
        estimate: Estimate for the same image; an infinite latitude or
            longitude marks a timeout.

    Returns:
        A ``PerImageResult`` holding both coordinate pairs, the error in
        metres and the 50 m / 20 m pass flags (both thresholds inclusive).
        For a timeout the error is ``inf`` and both flags are False.

    Raises:
        ValueError: ``gt`` and ``estimate`` carry different ``image_id``s.
    """
    if estimate.image_id != gt.image_id:
        raise ValueError(
            f"image_id mismatch: gt='{gt.image_id}' estimate='{estimate.image_id}'"
        )

    lat, lon = estimate.est_lat_deg, estimate.est_lon_deg
    timed_out = _is_timeout(lat) or _is_timeout(lon)
    # A timed-out estimate is never handed to the distance routine.
    error_m = math.inf if timed_out else distance_m(gt.lat_deg, gt.lon_deg, lat, lon)
    return PerImageResult(
        image_id=gt.image_id,
        gt_lat=gt.lat_deg,
        gt_lon=gt.lon_deg,
        est_lat=lat,
        est_lon=lon,
        error_m=error_m,
        pass_50m=not timed_out and error_m <= 50.0,
        pass_20m=not timed_out and error_m <= 20.0,
    )
def evaluate(
    gt_rows: Sequence[GtCoordinate],
    estimates: Sequence[EstimateInput],
) -> tuple[list[PerImageResult], AggregateReport]:
    """Match estimates to ground truth by ``image_id`` and score the run.

    Results follow the order of ``gt_rows`` so the evidence CSV is stable
    across runs. A ground-truth row with no estimate is scored as a timeout
    (both estimate coordinates ``inf``).

    Args:
        gt_rows: Ground-truth coordinates; defines the output order.
        estimates: At most one estimate per ground-truth image.

    Returns:
        ``(results, aggregate)``: one ``PerImageResult`` per ground-truth
        row, plus the ``AggregateReport`` with the 50 m / 20 m pass counts
        and the number of timeouts.

    Raises:
        ValueError: two estimates share an ``image_id``, or an estimate
            names an ``image_id`` absent from ``gt_rows``.
    """
    by_id = {}
    repeated = set()
    for item in estimates:
        if item.image_id in by_id:
            repeated.add(item.image_id)
        by_id[item.image_id] = item
    if repeated:
        raise ValueError(f"duplicate estimate image_ids: {sorted(repeated)}")

    known_ids = {row.image_id for row in gt_rows}
    stranger_ids = sorted(key for key in by_id if key not in known_ids)
    if stranger_ids:
        raise ValueError(
            f"estimate(s) for image_id(s) not in GT: {stranger_ids}"
        )

    results: list[PerImageResult] = []
    timeout_count = 0
    for row in gt_rows:
        matched = by_id.get(row.image_id)
        if matched is None:
            # No estimate at all: stand in the same inf/inf marker a timeout uses.
            matched = EstimateInput(
                image_id=row.image_id, est_lat_deg=math.inf, est_lon_deg=math.inf
            )
        if _is_timeout(matched.est_lat_deg) or _is_timeout(matched.est_lon_deg):
            timeout_count += 1
        results.append(compute_per_image(row, matched))

    within_50m = len([r for r in results if r.pass_50m])
    within_20m = len([r for r in results if r.pass_20m])
    aggregate = AggregateReport(
        total_images=len(results),
        pass_count_50m=within_50m,
        pass_count_20m=within_20m,
        timeout_count=timeout_count,
    )
    return results, aggregate
def write_csv_evidence(out_path: Path, results: Iterable[PerImageResult]) -> Path:
    """Dump the per-image FT-P-01 results to a CSV evidence file.

    Columns, in order: ``image_id``, ``gt_lat``, ``gt_lon``, ``est_lat``,
    ``est_lon``, ``error_m``, ``pass_50m``, ``pass_20m``. Coordinates are
    written with six decimals and the error with three. Infinite estimate or
    error values are written as the literal ``inf``, and the pass flags as
    ``true`` / ``false``.

    Args:
        out_path: Destination file; missing parent directories are created
            and an existing file is overwritten.
        results: Rows to write, in the order they should appear.

    Returns:
        ``out_path``, unchanged.
    """
    columns = (
        "image_id",
        "gt_lat",
        "gt_lon",
        "est_lat",
        "est_lon",
        "error_m",
        "pass_50m",
        "pass_20m",
    )

    def _number(value: float, spec: str) -> str:
        # Infinite values get a fixed token rather than Python's float formatting.
        return "inf" if math.isinf(value) else format(value, spec)

    def _flag(ok: bool) -> str:
        return "true" if ok else "false"

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", newline="") as sink:
        out = csv.writer(sink)
        out.writerow(columns)
        out.writerows(
            (
                row.image_id,
                format(row.gt_lat, ".6f"),
                format(row.gt_lon, ".6f"),
                _number(row.est_lat, ".6f"),
                _number(row.est_lon, ".6f"),
                _number(row.error_m, ".3f"),
                _flag(row.pass_50m),
                _flag(row.pass_20m),
            )
            for row in results
        )
    return out_path