[AZ-699] Real-flight validation runner + Markdown accuracy report

New e2e test runs gps-denied-replay --auto-trim against the real derkachi.tlog + flight video + AZ-702 calibration, computes the horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m threshold-hit share), writes _docs/06_metrics/real_flight_validation_{date}.md, and asserts honest PASS/FAIL with no @xfail mask. AZ-404's 1-min test is untouched (sibling, not replacement). Extends gps_compare.py with HorizontalErrorDistribution + percentile_sorted (numpy-equivalent linear interpolation). New test helper _report_writer.py renders the canonical Markdown schema documented as FT-P-20 in blackbox-tests.md. 16 new unit tests pin distribution arithmetic, verdict gate, failure-message templating (references calibration acquisition method per AC-3), and report layout. 129 passed in focused regression, 3 skipped (real video / Tier-2 prerequisites). Zero new mypy --strict errors. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 18:41:13 +00:00 · 2026-05-20 16:53:48 +03:00
parent f5366bbca1
commit dcde602f61
9 changed files with 1261 additions and 2 deletions
@@ -18,8 +18,11 @@ from gps_denied_onboard.helpers.engine_filename_schema import (
 )
 from gps_denied_onboard.helpers.gps_compare import (
    GroundTruthRow,
+    HorizontalErrorDistribution,
+    horizontal_error_distribution,
    l2_horizontal_m,
    match_percentage,
+    percentile_sorted,
 )
 from gps_denied_onboard.helpers.imu_preintegrator import (
    CombinedImuFactor,
@@ -77,6 +80,7 @@ __all__ = [
    "EngineFilenameSchema",
    "EngineFilenameSchemaError",
    "GroundTruthRow",
+    "HorizontalErrorDistribution",
    "ImuPreintegrationError",
    "ImuPreintegrator",
    "LightGlueConcurrentAccessError",
@@ -92,6 +96,7 @@ __all__ = [
    "WgsConverter",
    "adjoint",
    "exp_map",
+    "horizontal_error_distribution",
    "is_valid_rotation",
    "iso_ts_from_clock",
    "iso_ts_now",
@@ -100,5 +105,6 @@ __all__ = [
    "match_percentage",
    "make_imu_preintegrator",
    "matrix_to_se3",
+    "percentile_sorted",
    "se3_to_matrix",
 ]
@@ -19,8 +19,11 @@ from typing import Any

 __all__ = [
    "GroundTruthRow",
+    "HorizontalErrorDistribution",
+    "horizontal_error_distribution",
    "l2_horizontal_m",
    "match_percentage",
+    "percentile_sorted",
 ]


@@ -118,3 +121,162 @@ def _bisect_left(seq: list[float], target: float) -> int:
        else:
            hi = mid
    return lo
+
+
+# ---------------------------------------------------------------------
+# AZ-699: accuracy distribution helpers for the real-flight runner
+# ---------------------------------------------------------------------
+
+# Threshold-hit buckets, in metres, for the AZ-696 epic's distance gates.
+# Used as the default ``thresholds_m`` of horizontal_error_distribution;
+# the share dict is built by iterating this tuple, so report tables keep
+# this exact order across runs.
+_DEFAULT_THRESHOLDS_M: tuple[float, ...] = (10.0, 25.0, 50.0, 100.0)
+
+
+@dataclass(frozen=True)
+class HorizontalErrorDistribution:
+    """Aggregate position-error statistics for one estimator run.
+
+    Produced by :func:`horizontal_error_distribution`. Horizontal
+    errors are the per-emission distances returned by
+    :func:`l2_horizontal_m`; vertical errors are absolute altitude
+    differences.
+
+    ``threshold_hit_share`` is keyed by the thresholds the producer was
+    called with (:data:`_DEFAULT_THRESHOLDS_M` unless overridden); each
+    value is the share of horizontal errors ``<=`` that threshold, in
+    ``[0.0, 1.0]``.
+
+    The ``vertical_*`` fields only cover samples where the emission
+    carries an ``alt_m`` AND the matched ground-truth row carries an
+    altitude. Samples without both contribute nothing — neither to the
+    stats nor to ``vertical_count``.
+
+    When a count is ``0`` the matching statistics are all ``0.0``, so
+    consumers must gate on ``count`` / ``vertical_count`` before
+    treating those numbers as real measurements.
+
+    This DTO is the source of truth for AZ-699's report writer and for
+    downstream HTTP-API / dashboard consumers.
+
+    NOTE(review): ``frozen=True`` only blocks attribute rebinding — the
+    ``threshold_hit_share`` dict itself stays mutable, and the dict
+    field makes instances unhashable (``hash()`` raises ``TypeError``).
+    """
+
+    # Number of emissions that contributed a horizontal-error sample.
+    count: int
+    # Arithmetic mean of the horizontal errors.
+    horizontal_error_mean_m: float
+    # 50th / 95th / 99th percentiles of the horizontal errors.
+    horizontal_error_p50_m: float
+    horizontal_error_p95_m: float
+    horizontal_error_p99_m: float
+    # threshold -> share of horizontal errors <= threshold.
+    threshold_hit_share: dict[float, float]
+    # Number of samples that contributed a vertical-error sample.
+    vertical_count: int
+    # Mean and 50th / 95th percentiles of the absolute altitude error.
+    vertical_error_mean_m: float
+    vertical_error_p50_m: float
+    vertical_error_p95_m: float
+
+
+def percentile_sorted(values_sorted: list[float], pct: float) -> float:
+    """Return the ``pct``-th percentile of an already-sorted list.
+
+    Uses linear interpolation between the two closest ranks: the
+    fractional rank is ``pct / 100 * (n - 1)``, which is the rule behind
+    numpy's default ``linear`` percentile method, so the report numbers
+    are reproducible (up to floating-point rounding) by anyone with
+    numpy or scipy at hand.
+
+    :param values_sorted: samples in ascending order. The order is NOT
+        checked; unsorted input silently yields a meaningless result.
+    :param pct: percentile in ``[0.0, 100.0]``. Values outside that
+        range are clamped to the first / last element.
+    :returns: the interpolated percentile, or ``0.0`` for empty input
+        so downstream report formatters never trip on a
+        divide-by-zero; the caller is expected to gate on
+        ``count > 0`` for honest reporting.
+    """
+    # Empty input: return a neutral value instead of raising.
+    if not values_sorted:
+        return 0.0
+    # Clamp out-of-range percentiles to the extremes.
+    if pct <= 0.0:
+        return values_sorted[0]
+    if pct >= 100.0:
+        return values_sorted[-1]
+    n = len(values_sorted)
+    # Fractional zero-based rank of the requested percentile.
+    rank = (pct / 100.0) * (n - 1)
+    lo = int(math.floor(rank))
+    hi = int(math.ceil(rank))
+    # The rank landed exactly on a sample: nothing to interpolate.
+    if lo == hi:
+        return values_sorted[lo]
+    # Weight the two neighbouring samples by their distance to the rank.
+    frac = rank - lo
+    return values_sorted[lo] * (1.0 - frac) + values_sorted[hi] * frac
+
+
+def horizontal_error_distribution(
+    emissions: list[dict[str, Any]],
+    ground_truth: list[GroundTruthRow],
+    *,
+    thresholds_m: tuple[float, ...] = _DEFAULT_THRESHOLDS_M,
+) -> HorizontalErrorDistribution:
+    """Walk all emissions once, return aggregated position-error stats.
+
+    Pairs each emission with the nearest-in-time ground-truth row,
+    located with :func:`_bisect_left` over the time-sorted rows (the
+    same bisect strategy as :func:`match_percentage`). The horizontal
+    error of a pair is :func:`l2_horizontal_m` between the two
+    positions. Vertical error is computed when the emission carries an
+    ``alt_m`` and the matched row's ``alt_m`` is not NaN.
+
+    Each emission is read as a mapping with:
+
+    * ``"emitted_at"`` — timestamp, converted with ``int()`` and divided
+      by ``1e9`` to obtain seconds;
+    * ``"position_wgs84"`` — mapping with ``"lat_deg"``, ``"lon_deg"``
+      and an optional ``"alt_m"``.
+
+    :param emissions: estimator emissions in the shape described above.
+    :param ground_truth: reference rows; need not be pre-sorted.
+    :param thresholds_m: distance gates; the result reports, per gate,
+        the share of horizontal errors ``<=`` that gate.
+    :returns: the aggregated distribution. Empty ``emissions`` produce a
+        zero-count distribution (every numeric field is ``0.0``); the
+        vertical fields are likewise ``0.0`` when no pair had both
+        altitudes.
+    :raises AssertionError: if ``ground_truth`` is empty.
+
+    The caller is responsible for surfacing "no emissions" as a
+    separate fail condition before consuming the report — the helper
+    itself never raises on empty ``emissions`` so report formatting
+    stays defensive.
+    """
+    # Raised explicitly (not via ``assert``), so the check also runs
+    # under ``python -O``.
+    if not ground_truth:
+        raise AssertionError("ground_truth must be non-empty")
+    if not emissions:
+        return HorizontalErrorDistribution(
+            count=0,
+            horizontal_error_mean_m=0.0,
+            horizontal_error_p50_m=0.0,
+            horizontal_error_p95_m=0.0,
+            horizontal_error_p99_m=0.0,
+            threshold_hit_share={t: 0.0 for t in thresholds_m},
+            vertical_count=0,
+            vertical_error_mean_m=0.0,
+            vertical_error_p50_m=0.0,
+            vertical_error_p95_m=0.0,
+        )
+
+    # Sort once so every emission can be located by binary search.
+    gt_sorted = sorted(ground_truth, key=lambda r: r.t_s)
+    gt_times = [r.t_s for r in gt_sorted]
+
+    horiz_errors: list[float] = []
+    vert_errors: list[float] = []
+    for emit in emissions:
+        # Presumably integer nanoseconds on the same clock as
+        # ``GroundTruthRow.t_s`` — TODO confirm with the producer.
+        emit_ts_ns = int(emit["emitted_at"])
+        emit_t_s = emit_ts_ns / 1e9
+        # Presumably the leftmost insertion index of emit_t_s.
+        idx = _bisect_left(gt_times, emit_t_s)
+        # The rows just before and just after the emission time; with a
+        # non-empty ground truth at least one of them exists.
+        candidates = []
+        if idx > 0:
+            candidates.append(gt_sorted[idx - 1])
+        if idx < len(gt_sorted):
+            candidates.append(gt_sorted[idx])
+        # ``min`` keeps the first minimum, so an exact tie in time
+        # distance resolves to the earlier row.
+        nearest = min(candidates, key=lambda r: abs(r.t_s - emit_t_s))
+        emit_pos = emit["position_wgs84"]
+        horiz_errors.append(
+            l2_horizontal_m(
+                emit_pos["lat_deg"],
+                emit_pos["lon_deg"],
+                nearest.lat_deg,
+                nearest.lon_deg,
+            )
+        )
+        # NOTE(review): assumes ``GroundTruthRow.alt_m`` is a float that
+        # uses NaN for "missing" (``None`` would make ``math.isnan``
+        # raise) — confirm. A NaN ``alt_m`` on the emission side is not
+        # filtered and would yield a NaN vertical error.
+        emit_alt = emit_pos.get("alt_m")
+        if emit_alt is not None and not math.isnan(nearest.alt_m):
+            vert_errors.append(abs(float(emit_alt) - nearest.alt_m))
+
+    horiz_sorted = sorted(horiz_errors)
+    horiz_mean = sum(horiz_errors) / len(horiz_errors)
+    # Inclusive gates: an error exactly equal to the threshold is a hit.
+    thresholds_share = {
+        t: sum(1 for e in horiz_errors if e <= t) / len(horiz_errors)
+        for t in thresholds_m
+    }
+
+    # Vertical stats fall back to 0.0 when no pair had both altitudes;
+    # ``vertical_count`` tells consumers whether they are meaningful.
+    if vert_errors:
+        vert_sorted = sorted(vert_errors)
+        vert_mean = sum(vert_errors) / len(vert_errors)
+        vert_p50 = percentile_sorted(vert_sorted, 50.0)
+        vert_p95 = percentile_sorted(vert_sorted, 95.0)
+    else:
+        vert_mean = 0.0
+        vert_p50 = 0.0
+        vert_p95 = 0.0
+
+    return HorizontalErrorDistribution(
+        count=len(horiz_errors),
+        horizontal_error_mean_m=horiz_mean,
+        horizontal_error_p50_m=percentile_sorted(horiz_sorted, 50.0),
+        horizontal_error_p95_m=percentile_sorted(horiz_sorted, 95.0),
+        horizontal_error_p99_m=percentile_sorted(horiz_sorted, 99.0),
+        threshold_hit_share=thresholds_share,
+        vertical_count=len(vert_errors),
+        vertical_error_mean_m=vert_mean,
+        vertical_error_p50_m=vert_p50,
+        vertical_error_p95_m=vert_p95,
+    )