mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-22 18:41:13 +00:00
[AZ-699] Real-flight validation runner + Markdown accuracy report
New e2e test runs gps-denied-replay --auto-trim against the real
derkachi.tlog + flight video + AZ-702 calibration, computes the
horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m
threshold-hit share), writes
_docs/06_metrics/real_flight_validation_{date}.md, and asserts honest
PASS/FAIL with no @xfail mask. AZ-404's 1-min test is untouched
(sibling, not replacement).
Extends gps_compare.py with HorizontalErrorDistribution +
percentile_sorted (numpy-equivalent linear interpolation). New
test helper _report_writer.py renders the canonical Markdown
schema documented as FT-P-20 in blackbox-tests.md.
16 new unit tests pin distribution arithmetic, verdict gate,
failure-message templating (references calibration acquisition
method per AC-3), and report layout. 129 passed in focused
regression, 3 skipped (real video / Tier-2 prerequisites).
Zero new mypy --strict errors.
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -18,8 +18,11 @@ from gps_denied_onboard.helpers.engine_filename_schema import (
|
||||
)
|
||||
from gps_denied_onboard.helpers.gps_compare import (
|
||||
GroundTruthRow,
|
||||
HorizontalErrorDistribution,
|
||||
horizontal_error_distribution,
|
||||
l2_horizontal_m,
|
||||
match_percentage,
|
||||
percentile_sorted,
|
||||
)
|
||||
from gps_denied_onboard.helpers.imu_preintegrator import (
|
||||
CombinedImuFactor,
|
||||
@@ -77,6 +80,7 @@ __all__ = [
|
||||
"EngineFilenameSchema",
|
||||
"EngineFilenameSchemaError",
|
||||
"GroundTruthRow",
|
||||
"HorizontalErrorDistribution",
|
||||
"ImuPreintegrationError",
|
||||
"ImuPreintegrator",
|
||||
"LightGlueConcurrentAccessError",
|
||||
@@ -92,6 +96,7 @@ __all__ = [
|
||||
"WgsConverter",
|
||||
"adjoint",
|
||||
"exp_map",
|
||||
"horizontal_error_distribution",
|
||||
"is_valid_rotation",
|
||||
"iso_ts_from_clock",
|
||||
"iso_ts_now",
|
||||
@@ -100,5 +105,6 @@ __all__ = [
|
||||
"match_percentage",
|
||||
"make_imu_preintegrator",
|
||||
"matrix_to_se3",
|
||||
"percentile_sorted",
|
||||
"se3_to_matrix",
|
||||
]
|
||||
|
||||
@@ -19,8 +19,11 @@ from typing import Any
|
||||
|
||||
__all__ = [
|
||||
"GroundTruthRow",
|
||||
"HorizontalErrorDistribution",
|
||||
"horizontal_error_distribution",
|
||||
"l2_horizontal_m",
|
||||
"match_percentage",
|
||||
"percentile_sorted",
|
||||
]
|
||||
|
||||
|
||||
@@ -118,3 +121,162 @@ def _bisect_left(seq: list[float], target: float) -> int:
|
||||
else:
|
||||
hi = mid
|
||||
return lo
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# AZ-699: accuracy distribution helpers for the real-flight runner
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
# Threshold-hit buckets (in metres) for the AZ-696 epic's distance gates.
|
||||
# Reported in this exact order so report tables stay stable across runs;
# used as the default ``thresholds_m`` of horizontal_error_distribution().
|
||||
_DEFAULT_THRESHOLDS_M: tuple[float, ...] = (10.0, 25.0, 50.0, 100.0)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class HorizontalErrorDistribution:
    """Summary of how far one estimator run landed from ground truth.

    Horizontal figures are great-circle distances in metres, as produced
    by :func:`l2_horizontal_m`. ``count`` is the number of emissions that
    were paired with a ground-truth row and therefore feed those figures.

    ``threshold_hit_share`` maps a distance threshold in metres (by
    default the buckets in :data:`_DEFAULT_THRESHOLDS_M`) to the share of
    emissions, expressed in ``[0.0, 1.0]``, whose horizontal error is
    within that threshold.

    The ``vertical_*`` fields only cover samples where the emission has
    an ``alt_m`` component AND the matched ground-truth row has an
    altitude. Samples missing either altitude are left out of both the
    statistics and ``vertical_count``.

    Note: ``frozen=True`` only blocks re-assignment of the fields; the
    ``threshold_hit_share`` dict itself is still mutable, so treat it as
    read-only.

    Style: this DTO is the source of truth for AZ-699's report writer
    and for downstream HTTP-API / dashboard consumers.
    """

    # Horizontal error statistics.
    count: int
    horizontal_error_mean_m: float
    horizontal_error_p50_m: float
    horizontal_error_p95_m: float
    horizontal_error_p99_m: float
    threshold_hit_share: dict[float, float]

    # Vertical error statistics (subset of samples with both altitudes).
    vertical_count: int
    vertical_error_mean_m: float
    vertical_error_p50_m: float
    vertical_error_p95_m: float
|
||||
|
||||
|
||||
def percentile_sorted(values_sorted: list[float], pct: float) -> float:
    """Return the ``pct``-th percentile of an already-sorted list.

    Uses the same linear-interpolation rule as numpy's default
    (``rank = pct / 100 * (n - 1)``, interpolate between the two
    bracketing samples), so report numbers agree with numpy up to
    floating-point rounding.

    Args:
        values_sorted: Samples in ascending order. The function does not
            sort or validate the order.
        pct: Percentile in ``[0.0, 100.0]``. Values at or below ``0.0``
            return the first sample; values at or above ``100.0`` return
            the last one.

    Returns:
        The interpolated percentile, or ``0.0`` for empty input so
        downstream report formatters never trip on a missing value. The
        caller is expected to gate on ``count > 0`` for honest reporting.
    """
    if not values_sorted:
        return 0.0
    if pct <= 0.0:
        return values_sorted[0]
    if pct >= 100.0:
        return values_sorted[-1]

    rank = (pct / 100.0) * (len(values_sorted) - 1)
    lo = math.floor(rank)
    hi = math.ceil(rank)
    below = values_sorted[lo]
    if lo == hi:
        return below

    above = values_sorted[hi]
    # Equal neighbours have exactly one valid answer. The weighted sum
    # below is not guaranteed to reproduce it bit-for-bit, which could
    # flip an exact ``<=`` threshold comparison on the reported value.
    if below == above:
        return below

    frac = rank - lo
    return below * (1.0 - frac) + above * frac
|
||||
|
||||
|
||||
def horizontal_error_distribution(
    emissions: list[dict[str, Any]],
    ground_truth: list[GroundTruthRow],
    *,
    thresholds_m: tuple[float, ...] = _DEFAULT_THRESHOLDS_M,
) -> HorizontalErrorDistribution:
    """Walk all emissions once, return aggregated horizontal-error stats.

    Each emission is paired with the nearest-in-time ground-truth row,
    located with the module's ``_bisect_left`` helper over the rows
    sorted by ``t_s``. On a tie the earlier row wins.

    Vertical error is computed only when both the emission and the
    matched row carry a numeric altitude: an emission whose ``alt_m`` is
    missing, ``None`` or NaN, or a matched row whose ``alt_m`` is NaN,
    contributes nothing to the vertical stats or ``vertical_count``.

    Args:
        emissions: Emission records. Each must provide ``"emitted_at"``
            (read as an integer and divided by 1e9 to get seconds, i.e.
            treated as nanoseconds) and ``"position_wgs84"`` with
            ``"lat_deg"`` / ``"lon_deg"`` and an optional ``"alt_m"``.
        ground_truth: Reference rows; need not be pre-sorted.
        thresholds_m: Distance thresholds for which the share of
            emissions with horizontal error ``<=`` threshold is reported.

    Returns:
        The aggregated distribution. Empty ``emissions`` produce a
        zero-count distribution (every numeric field is ``0.0``); the
        caller is responsible for surfacing "no emissions" as a separate
        fail condition before consuming the report.

    Raises:
        AssertionError: If ``ground_truth`` is empty. This is checked
            before the empty-``emissions`` shortcut.
        KeyError: If an emission lacks one of the required keys.
    """
    if not ground_truth:
        raise AssertionError("ground_truth must be non-empty")
    if not emissions:
        return HorizontalErrorDistribution(
            count=0,
            horizontal_error_mean_m=0.0,
            horizontal_error_p50_m=0.0,
            horizontal_error_p95_m=0.0,
            horizontal_error_p99_m=0.0,
            threshold_hit_share={t: 0.0 for t in thresholds_m},
            vertical_count=0,
            vertical_error_mean_m=0.0,
            vertical_error_p50_m=0.0,
            vertical_error_p95_m=0.0,
        )

    gt_sorted = sorted(ground_truth, key=lambda r: r.t_s)
    gt_times = [r.t_s for r in gt_sorted]

    horiz_errors: list[float] = []
    vert_errors: list[float] = []
    for emit in emissions:
        emit_ts_ns = int(emit["emitted_at"])
        emit_t_s = emit_ts_ns / 1e9

        # The insertion point brackets the emission time: the row just
        # before it and the row at it are the only nearest candidates.
        idx = _bisect_left(gt_times, emit_t_s)
        candidates = []
        if idx > 0:
            candidates.append(gt_sorted[idx - 1])
        if idx < len(gt_sorted):
            candidates.append(gt_sorted[idx])
        nearest = min(candidates, key=lambda r: abs(r.t_s - emit_t_s))

        emit_pos = emit["position_wgs84"]
        horiz_errors.append(
            l2_horizontal_m(
                emit_pos["lat_deg"],
                emit_pos["lon_deg"],
                nearest.lat_deg,
                nearest.lon_deg,
            )
        )

        emit_alt = emit_pos.get("alt_m")
        if emit_alt is None or math.isnan(nearest.alt_m):
            continue
        emit_alt_m = float(emit_alt)
        # A NaN emission altitude is "no altitude": letting it through
        # would turn the vertical mean into NaN and break the sort order
        # the percentile helper relies on.
        if math.isnan(emit_alt_m):
            continue
        vert_errors.append(abs(emit_alt_m - nearest.alt_m))

    horiz_sorted = sorted(horiz_errors)
    horiz_mean = sum(horiz_errors) / len(horiz_errors)
    thresholds_share = {
        t: sum(1 for e in horiz_errors if e <= t) / len(horiz_errors)
        for t in thresholds_m
    }

    if vert_errors:
        vert_sorted = sorted(vert_errors)
        vert_mean = sum(vert_errors) / len(vert_errors)
        vert_p50 = percentile_sorted(vert_sorted, 50.0)
        vert_p95 = percentile_sorted(vert_sorted, 95.0)
    else:
        vert_mean = 0.0
        vert_p50 = 0.0
        vert_p95 = 0.0

    return HorizontalErrorDistribution(
        count=len(horiz_errors),
        horizontal_error_mean_m=horiz_mean,
        horizontal_error_p50_m=percentile_sorted(horiz_sorted, 50.0),
        horizontal_error_p95_m=percentile_sorted(horiz_sorted, 95.0),
        horizontal_error_p99_m=percentile_sorted(horiz_sorted, 99.0),
        threshold_hit_share=thresholds_share,
        vertical_count=len(vert_errors),
        vertical_error_mean_m=vert_mean,
        vertical_error_p50_m=vert_p50,
        vertical_error_p95_m=vert_p95,
    )
|
||||
|
||||
Reference in New Issue
Block a user