[AZ-699] Real-flight validation runner + Markdown accuracy report

New e2e test runs gps-denied-replay --auto-trim against the real
derkachi.tlog + flight video + AZ-702 calibration, computes the
horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m
threshold-hit share), writes _docs/06_metrics/real_flight_
validation_{date}.md, and asserts honest PASS/FAIL with no @xfail
mask. AZ-404's 1-min test is untouched (sibling, not replacement).

Extends gps_compare.py with HorizontalErrorDistribution +
percentile_sorted (numpy-equivalent linear interpolation). New
test helper _report_writer.py renders the canonical Markdown
schema documented as FT-P-20 in blackbox-tests.md.

16 new unit tests pin distribution arithmetic, verdict gate,
failure-message templating (references calibration acquisition
method per AC-3), and report layout. 129 passed in focused
regression, 3 skipped (real video / Tier-2 prerequisites).
Zero new mypy --strict errors.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-20 16:53:48 +03:00
parent f5366bbca1
commit dcde602f61
9 changed files with 1261 additions and 2 deletions
@@ -18,8 +18,11 @@ from gps_denied_onboard.helpers.engine_filename_schema import (
)
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
HorizontalErrorDistribution,
horizontal_error_distribution,
l2_horizontal_m,
match_percentage,
percentile_sorted,
)
from gps_denied_onboard.helpers.imu_preintegrator import (
CombinedImuFactor,
@@ -77,6 +80,7 @@ __all__ = [
"EngineFilenameSchema",
"EngineFilenameSchemaError",
"GroundTruthRow",
"HorizontalErrorDistribution",
"ImuPreintegrationError",
"ImuPreintegrator",
"LightGlueConcurrentAccessError",
@@ -92,6 +96,7 @@ __all__ = [
"WgsConverter",
"adjoint",
"exp_map",
"horizontal_error_distribution",
"is_valid_rotation",
"iso_ts_from_clock",
"iso_ts_now",
@@ -100,5 +105,6 @@ __all__ = [
"match_percentage",
"make_imu_preintegrator",
"matrix_to_se3",
"percentile_sorted",
"se3_to_matrix",
]
@@ -19,8 +19,11 @@ from typing import Any
__all__ = [
"GroundTruthRow",
"HorizontalErrorDistribution",
"horizontal_error_distribution",
"l2_horizontal_m",
"match_percentage",
"percentile_sorted",
]
@@ -118,3 +121,162 @@ def _bisect_left(seq: list[float], target: float) -> int:
else:
hi = mid
return lo
# ---------------------------------------------------------------------
# AZ-699: accuracy distribution helpers for the real-flight runner
# ---------------------------------------------------------------------
# Threshold-hit buckets (in metres) for the AZ-696 epic's distance gates.
# Reported in this exact order so report tables stay stable across runs.
# Serves as the default ``thresholds_m`` argument of
# :func:`horizontal_error_distribution`.
_DEFAULT_THRESHOLDS_M: tuple[float, ...] = (10.0, 25.0, 50.0, 100.0)
@dataclass(frozen=True)
class HorizontalErrorDistribution:
    """Aggregate horizontal (and optional vertical) error stats for one run.

    Instances are produced by :func:`horizontal_error_distribution`.

    All horizontal distances are great-circle metres (see
    :func:`l2_horizontal_m`).

    ``count`` is the number of emissions that contributed a horizontal
    sample, i.e. every emission that was paired with a ground-truth row.

    ``threshold_hit_share`` maps each threshold (metres) that was passed
    to :func:`horizontal_error_distribution` — by default the buckets in
    :data:`_DEFAULT_THRESHOLDS_M` — to the share of samples whose
    horizontal error is ``<=`` that threshold. Values lie in
    ``[0.0, 1.0]``.

    The ``vertical_*`` fields only cover samples where the emission
    carries an ``alt_m`` component AND the matched ground-truth row has a
    non-NaN altitude. Samples with a missing altitude contribute nothing
    — neither to the stats nor to ``vertical_count``. When no sample
    qualifies, ``vertical_count`` is ``0`` and the vertical stats are
    ``0.0``.

    For an empty emission list every numeric field is ``0`` / ``0.0``;
    consumers should gate on ``count > 0`` before trusting the numbers.

    Note: this DTO is the source of truth for AZ-699's report writer
    and for downstream HTTP-API / dashboard consumers.
    """

    # Number of emissions that produced a horizontal-error sample.
    count: int
    # Horizontal error statistics, in metres.
    horizontal_error_mean_m: float
    horizontal_error_p50_m: float
    horizontal_error_p95_m: float
    horizontal_error_p99_m: float
    # threshold (m) -> share of samples with error <= threshold.
    threshold_hit_share: dict[float, float]
    # Number of samples that also produced a vertical-error sample.
    vertical_count: int
    # Vertical (absolute altitude difference) statistics, in metres.
    vertical_error_mean_m: float
    vertical_error_p50_m: float
    vertical_error_p95_m: float
def percentile_sorted(values_sorted: list[float], pct: float) -> float:
    """Return the ``pct``-th percentile of an ascending-sorted list.

    The percentile is located at fractional rank
    ``(pct / 100) * (len(values_sorted) - 1)`` and linearly interpolated
    between the two neighbouring samples. This is intended to mirror
    numpy's default ``linear`` rule so report numbers can be reproduced
    with numpy or scipy.

    Args:
        values_sorted: Samples in ascending order. The list is NOT
            sorted here; passing unsorted data yields meaningless output.
        pct: Requested percentile, nominally in ``[0.0, 100.0]``.
            Values at or below ``0`` return the first sample, values at
            or above ``100`` return the last one.

    Returns:
        The interpolated percentile, or ``0.0`` for an empty list so
        report formatters never fail on missing data. Callers are
        expected to gate on ``count > 0`` for honest reporting.
    """
    if not values_sorted:
        return 0.0
    if pct <= 0.0:
        return values_sorted[0]
    if pct >= 100.0:
        return values_sorted[-1]

    # Fractional position of the percentile inside the sorted list.
    position = (pct / 100.0) * (len(values_sorted) - 1)
    below = int(math.floor(position))
    weight = position - below
    if weight == 0.0:
        # The rank lands exactly on a sample; nothing to interpolate.
        return values_sorted[below]

    above = below + 1
    return values_sorted[below] * (1.0 - weight) + values_sorted[above] * weight
def horizontal_error_distribution(
    emissions: list[dict[str, Any]],
    ground_truth: list[GroundTruthRow],
    *,
    thresholds_m: tuple[float, ...] = _DEFAULT_THRESHOLDS_M,
) -> HorizontalErrorDistribution:
    """Aggregate horizontal (and optional vertical) error over all emissions.

    Every emission is matched to the ground-truth row closest in time.
    The rows are sorted by ``t_s`` and searched with
    :func:`_bisect_left`; when the two neighbouring rows are equally
    close, the earlier one wins. ``emission["emitted_at"]`` is converted
    with ``int(...) / 1e9`` (i.e. treated as nanoseconds) before being
    compared with ``GroundTruthRow.t_s``.

    The horizontal error is :func:`l2_horizontal_m` between the
    emission's ``position_wgs84`` (``lat_deg`` / ``lon_deg``) and the
    matched row. A vertical sample ``abs(alt_m - row.alt_m)`` is added
    only when the emission has a non-``None`` ``alt_m`` and the matched
    row's altitude is not NaN.

    Args:
        emissions: Estimator output records. Each must provide
            ``"emitted_at"`` and ``"position_wgs84"``.
        ground_truth: Reference rows; must be non-empty.
        thresholds_m: Distance thresholds (metres) for the hit-share
            table. The result reports, per threshold, the fraction of
            samples whose horizontal error is ``<=`` the threshold.

    Returns:
        The aggregated distribution. An empty ``emissions`` list yields
        a zero-count result with every numeric field at ``0.0``; the
        caller is responsible for treating "no emissions" as its own
        failure before consuming the report.

    Raises:
        AssertionError: If ``ground_truth`` is empty. (Only empty
            *emissions* are tolerated silently.)
    """
    if not ground_truth:
        raise AssertionError("ground_truth must be non-empty")

    if not emissions:
        # Nothing to measure: hand back an all-zero distribution so
        # report formatting stays defensive.
        return HorizontalErrorDistribution(
            count=0,
            horizontal_error_mean_m=0.0,
            horizontal_error_p50_m=0.0,
            horizontal_error_p95_m=0.0,
            horizontal_error_p99_m=0.0,
            threshold_hit_share=dict.fromkeys(thresholds_m, 0.0),
            vertical_count=0,
            vertical_error_mean_m=0.0,
            vertical_error_p50_m=0.0,
            vertical_error_p95_m=0.0,
        )

    rows_by_time = sorted(ground_truth, key=lambda row: row.t_s)
    row_times = [row.t_s for row in rows_by_time]
    row_count = len(rows_by_time)

    horizontal_samples: list[float] = []
    vertical_samples: list[float] = []

    for record in emissions:
        stamp_ns = int(record["emitted_at"])
        stamp_s = stamp_ns / 1e9

        # Pick the temporally nearest row among the (up to two)
        # neighbours of the insertion point.
        slot = _bisect_left(row_times, stamp_s)
        if slot == 0:
            matched = rows_by_time[0]
        elif slot == row_count:
            matched = rows_by_time[slot - 1]
        else:
            earlier = rows_by_time[slot - 1]
            later = rows_by_time[slot]
            # Strict "<" keeps the earlier row on a tie.
            if abs(later.t_s - stamp_s) < abs(earlier.t_s - stamp_s):
                matched = later
            else:
                matched = earlier

        fix = record["position_wgs84"]
        distance_m = l2_horizontal_m(
            fix["lat_deg"],
            fix["lon_deg"],
            matched.lat_deg,
            matched.lon_deg,
        )
        horizontal_samples.append(distance_m)

        # Vertical error needs an altitude on both sides.
        altitude = fix.get("alt_m")
        if altitude is None or math.isnan(matched.alt_m):
            continue
        vertical_samples.append(abs(float(altitude) - matched.alt_m))

    sample_count = len(horizontal_samples)
    horizontal_ranked = sorted(horizontal_samples)
    mean_horizontal = sum(horizontal_samples) / sample_count

    hit_share: dict[float, float] = {}
    for limit in thresholds_m:
        hits = sum(1 for err in horizontal_samples if err <= limit)
        hit_share[limit] = hits / sample_count

    mean_vertical = 0.0
    p50_vertical = 0.0
    p95_vertical = 0.0
    if vertical_samples:
        vertical_ranked = sorted(vertical_samples)
        mean_vertical = sum(vertical_samples) / len(vertical_samples)
        p50_vertical = percentile_sorted(vertical_ranked, 50.0)
        p95_vertical = percentile_sorted(vertical_ranked, 95.0)

    return HorizontalErrorDistribution(
        count=sample_count,
        horizontal_error_mean_m=mean_horizontal,
        horizontal_error_p50_m=percentile_sorted(horizontal_ranked, 50.0),
        horizontal_error_p95_m=percentile_sorted(horizontal_ranked, 95.0),
        horizontal_error_p99_m=percentile_sorted(horizontal_ranked, 99.0),
        threshold_hit_share=hit_share,
        vertical_count=len(vertical_samples),
        vertical_error_mean_m=mean_vertical,
        vertical_error_p50_m=p50_vertical,
        vertical_error_p95_m=p95_vertical,
    )