mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 07:01:14 +00:00
dcde602f61
New e2e test runs gps-denied-replay --auto-trim against the real
derkachi.tlog + flight video + AZ-702 calibration, computes the
horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m
threshold-hit share), writes
_docs/06_metrics/real_flight_validation_{date}.md, and asserts honest PASS/FAIL with no @xfail
mask. AZ-404's 1-min test is untouched (sibling, not replacement).
Extends gps_compare.py with HorizontalErrorDistribution +
percentile_sorted (numpy-equivalent linear interpolation). New
test helper _report_writer.py renders the canonical Markdown
schema documented as FT-P-20 in blackbox-tests.md.
16 new unit tests pin distribution arithmetic, verdict gate,
failure-message templating (references calibration acquisition
method per AC-3), and report layout. 129 passed in focused
regression, 3 skipped (real video / Tier-2 prerequisites).
Zero new mypy --strict errors.
Co-authored-by: Cursor <cursoragent@cursor.com>
343 lines
10 KiB
Python
343 lines
10 KiB
Python
"""AZ-699 — Report writer + accuracy-distribution unit tests.
|
||
|
||
Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message
|
||
templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi
|
||
inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py``
|
||
and is skipped without the real video + ``RUN_REPLAY_E2E=1``.
|
||
|
||
Style: every test follows the Arrange / Act / Assert pattern.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
from gps_denied_onboard.helpers.gps_compare import (
|
||
GroundTruthRow,
|
||
HorizontalErrorDistribution,
|
||
horizontal_error_distribution,
|
||
percentile_sorted,
|
||
)
|
||
from tests.e2e.replay._report_writer import (
|
||
AC3_GATE_PCT,
|
||
AC3_GATE_THRESHOLD_M,
|
||
ReportContext,
|
||
format_failure_message,
|
||
render_report,
|
||
verdict_passes_ac3,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# percentile_sorted: pin numpy-equivalent linear interpolation
|
||
|
||
|
||
def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None:
    """Pin the endpoints and the exact median of a five-element sample."""
    # Arrange: odd length, so p50 lands on the middle element.
    samples = [1.0, 2.0, 3.0, 4.0, 5.0]
    expected_by_pct = {0.0: 1.0, 100.0: 5.0, 50.0: 3.0}

    # Act / Assert
    for pct, expected in expected_by_pct.items():
        assert percentile_sorted(samples, pct) == expected
|
||
|
||
|
||
def test_percentile_sorted_p95_linear_interpolation() -> None:
    """Pin linear interpolation on an exact rank and on fractional ranks.

    The 0..100 sample alone cannot tell linear interpolation apart from a
    nearest-rank rule, because p95 lands exactly on index 95. The two short
    samples force a rank that falls between neighbours.
    """
    # Arrange
    values = list(range(0, 101))  # 0..100
    even_length = [1.0, 2.0, 3.0, 4.0]
    two_points = [0.0, 10.0]

    # Act
    p95 = percentile_sorted(values, 95.0)
    p50_between = percentile_sorted(even_length, 50.0)
    p95_between = percentile_sorted(two_points, 95.0)

    # Assert: numpy's linear interpolation puts p95 at index 95 of
    # [0..100] → value = 95.
    assert p95 == pytest.approx(95.0, abs=1e-9)
    # Assert: rank 0.5 * (4 - 1) = 1.5 → halfway between 2.0 and 3.0.
    assert p50_between == pytest.approx(2.5, abs=1e-9)
    # Assert: rank 0.95 * (2 - 1) = 0.95 → 95 % of the way from 0.0 to 10.0.
    assert p95_between == pytest.approx(9.5, abs=1e-9)
|
||
|
||
|
||
def test_percentile_sorted_empty_returns_zero() -> None:
    """An empty sample yields 0.0 instead of raising."""
    # Arrange
    no_samples: list[float] = []

    # Act
    result = percentile_sorted(no_samples, 50.0)

    # Assert
    assert result == 0.0
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# horizontal_error_distribution: walk emissions once, return aggregate
|
||
|
||
|
||
def test_distribution_zero_error_at_collocated_points() -> None:
    """Emissions sitting exactly on the ground truth give an all-zero error."""
    # Arrange: two ground-truth rows and two emissions at the same fix.
    # NOTE(review): emitted_at 0 / 1_000_000_000 pairs with t_s 0.0 / 1.0,
    # so it looks like nanoseconds — confirm against gps_compare.
    gt = [
        GroundTruthRow(t_s=t_s, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)
        for t_s in (0.0, 1.0)
    ]
    emissions = [
        {
            "emitted_at": emitted_at,
            "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0},
        }
        for emitted_at in (0, 1_000_000_000)
    ]

    # Act
    dist = horizontal_error_distribution(emissions, gt)

    # Assert: both samples counted, no residual, every threshold fully hit.
    near_zero = pytest.approx(0.0, abs=1e-6)
    assert dist.count == 2
    assert dist.horizontal_error_mean_m == near_zero
    assert dist.horizontal_error_p99_m == near_zero
    hit_share = dist.threshold_hit_share
    for threshold_m in (10.0, 25.0, 50.0, 100.0):
        assert hit_share[threshold_m] == pytest.approx(1.0)
|
||
|
||
|
||
def test_distribution_threshold_hit_share_partials() -> None:
    """Each threshold's hit share counts only the emissions inside it."""
    # Arrange: one ground-truth point and four emissions displaced purely
    # in latitude. One degree of latitude is taken as ~111 km, so the
    # offsets (5 m, 20 m, 40 m, 80 m) fall one each into the < 10, < 25,
    # < 50 and < 100 m buckets, well clear of every threshold edge.
    gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    shifted_lats = [
        50.0 + (offset_m / 111_000.0) for offset_m in (5.0, 20.0, 40.0, 80.0)
    ]
    emissions = [
        {
            "emitted_at": index,
            "position_wgs84": {
                "lat_deg": lat_deg,
                "lon_deg": 30.0,
                "alt_m": 100.0,
            },
        }
        for index, lat_deg in enumerate(shifted_lats)
    ]

    # Act
    dist = horizontal_error_distribution(emissions, gt)

    # Assert: the share grows by one quarter per threshold.
    expected_share = {
        10.0: 0.25,  # only 5 m
        25.0: 0.50,  # + 20 m
        50.0: 0.75,  # + 40 m
        100.0: 1.0,  # + 80 m
    }
    assert dist.count == 4
    for threshold_m, share in expected_share.items():
        assert dist.threshold_hit_share[threshold_m] == pytest.approx(share)
|
||
|
||
|
||
def test_distribution_vertical_skipped_when_alt_absent() -> None:
    """A position without ``alt_m`` adds a horizontal sample but no vertical one."""
    # Arrange: the single emission's position carries lat/lon only.
    position_without_alt = {"lat_deg": 50.0, "lon_deg": 30.0}
    emissions = [{"emitted_at": 0, "position_wgs84": position_without_alt}]
    gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]

    # Act
    dist = horizontal_error_distribution(emissions, gt)

    # Assert: horizontal stats still count the emission; vertical stay empty.
    assert dist.count == 1
    assert dist.vertical_count == 0
    assert dist.vertical_error_mean_m == 0.0
|
||
|
||
|
||
def test_distribution_empty_emissions_returns_zero_dist() -> None:
    """No emissions produce a zero-count distribution with all shares at 0.0."""
    # Arrange
    gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    no_emissions: list[dict[str, object]] = []

    # Act
    dist = horizontal_error_distribution(no_emissions, gt)

    # Assert
    assert dist.count == 0
    assert dist.horizontal_error_mean_m == 0.0
    shares = list(dist.threshold_hit_share.values())
    for share in shares:
        assert share == 0.0
|
||
|
||
|
||
def test_distribution_raises_on_empty_ground_truth() -> None:
    """An empty ground-truth track is rejected with ``AssertionError``."""
    # Arrange: one valid emission, nothing to compare it against.
    lone_emission = {
        "emitted_at": 0,
        "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0},
    }
    no_ground_truth: list[GroundTruthRow] = []

    # Act / Assert
    with pytest.raises(AssertionError):
        horizontal_error_distribution([lone_emission], no_ground_truth)
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# Verdict + failure-message: AC-3 honesty
|
||
|
||
|
||
def _passing_distribution() -> HorizontalErrorDistribution:
    """Return a hand-built distribution with 92 % of samples within 100 m.

    It carries 100 horizontal samples and no vertical samples. The verdict
    test in this module expects ``verdict_passes_ac3`` to accept it.
    """
    hit_share = {10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92}
    distribution = HorizontalErrorDistribution(
        count=100,
        horizontal_error_mean_m=20.0,
        horizontal_error_p50_m=18.0,
        horizontal_error_p95_m=70.0,
        horizontal_error_p99_m=95.0,
        threshold_hit_share=hit_share,
        vertical_count=0,
        vertical_error_mean_m=0.0,
        vertical_error_p50_m=0.0,
        vertical_error_p95_m=0.0,
    )
    return distribution
|
||
|
||
|
||
def _failing_distribution() -> HorizontalErrorDistribution:
    """Return a hand-built distribution with only 45 % of samples within 100 m.

    It carries 200 horizontal samples and no vertical samples. The verdict
    test in this module expects ``verdict_passes_ac3`` to reject it.
    """
    hit_share = {10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45}
    distribution = HorizontalErrorDistribution(
        count=200,
        horizontal_error_mean_m=180.0,
        horizontal_error_p50_m=140.0,
        horizontal_error_p95_m=420.0,
        horizontal_error_p99_m=580.0,
        threshold_hit_share=hit_share,
        vertical_count=0,
        vertical_error_mean_m=0.0,
        vertical_error_p50_m=0.0,
        vertical_error_p95_m=0.0,
    )
    return distribution
|
||
|
||
|
||
def _context(method: str = "factory-sheet") -> ReportContext:
    """Return a fixed ``ReportContext`` that varies only in calibration method.

    Args:
        method: Value stored as ``calibration_acquisition_method``.
    """
    tlog = Path("/tmp/derkachi.tlog")
    video = Path("/tmp/flight_derkachi.mp4")
    context = ReportContext(
        run_date_utc="2026-05-20",
        tlog_path=tlog,
        video_path=video,
        calibration_acquisition_method=method,
        clip_duration_s=180.0,
        emissions_count=200,
    )
    return context
|
||
|
||
|
||
def test_verdict_passes_when_at_least_80pct_within_100m() -> None:
    """The gate accepts the 92 % fixture and rejects the 45 % fixture."""
    # Act
    passing_verdict = verdict_passes_ac3(_passing_distribution())
    failing_verdict = verdict_passes_ac3(_failing_distribution())

    # Assert: identity checks pin a real bool, not merely a truthy value.
    assert passing_verdict is True
    assert failing_verdict is False
|
||
|
||
|
||
def test_verdict_fails_on_zero_count() -> None:
    """A distribution with no samples never passes the AC-3 gate."""
    # Arrange: every statistic zeroed, only the 100 m share present.
    zero_share = {100.0: 0.0}
    no_samples = HorizontalErrorDistribution(
        count=0,
        horizontal_error_mean_m=0.0,
        horizontal_error_p50_m=0.0,
        horizontal_error_p95_m=0.0,
        horizontal_error_p99_m=0.0,
        threshold_hit_share=zero_share,
        vertical_count=0,
        vertical_error_mean_m=0.0,
        vertical_error_p50_m=0.0,
        vertical_error_p95_m=0.0,
    )

    # Act
    verdict = verdict_passes_ac3(no_samples)

    # Assert
    assert verdict is False
|
||
|
||
|
||
def test_failure_message_references_calibration_method_factory_sheet() -> None:
    """The failure message names the gate, the observed share, and the calibration.

    The gate distance and gate percentage are both taken from the
    ``_report_writer`` constants, so the two assertions follow the gate
    definition the same way.
    """
    # Act
    msg = format_failure_message(
        _failing_distribution(),
        _context(method="factory-sheet"),
    )

    # Assert: AZ-699 AC-3 — failure message attributes residual to the
    # calibration acquisition method and surfaces the budget.
    assert "AZ-699 AC-3" in msg
    assert "45.0 %" in msg  # threshold-hit share at 100 m × 100 = 45.0
    assert f"{AC3_GATE_THRESHOLD_M:.0f} m" in msg
    assert f"{AC3_GATE_PCT:.0f} %" in msg
    assert "Calibration: factory-sheet" in msg
    # The summary statistics of the failing fixture are echoed verbatim.
    assert "mean=180.0 m" in msg
    assert "p95=420.0 m" in msg
    assert "p99=580.0 m" in msg
|
||
|
||
|
||
def test_failure_message_references_calibration_method_placeholder() -> None:
    """The calibration label in the message follows the context's method."""
    # Arrange
    failing = _failing_distribution()
    placeholder_context = _context(method="placeholder")

    # Act
    msg = format_failure_message(failing, placeholder_context)

    # Assert
    assert "Calibration: placeholder" in msg
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# Render report: AC-2 — every required row present
|
||
|
||
|
||
def test_render_report_contains_all_required_rows_on_pass() -> None:
    """A passing report has the title, verdict, section headings and every row.

    Threshold rows are matched with their rendered share. A bare
    ``"| 100 |"`` could be satisfied by any cell holding 100, since the
    passing fixture has ``count=100``, and so would not prove the row exists.
    """
    # Act
    text = render_report(
        _passing_distribution(), _context(), passed=True
    )

    # Assert
    assert text.startswith("# Real-flight validation — 2026-05-20")
    assert "**Verdict**: PASS" in text
    assert "## Horizontal error (metres)" in text
    assert "## Threshold-hit share" in text
    assert "Calibration acquisition method: factory-sheet" in text
    for stat in ("Mean", "p50", "p95", "p99"):
        assert f"| {stat} |" in text
    # Shares of the passing fixture (0.2 / 0.6 / 0.78 / 0.92) as percentages
    # with one decimal, the same row format the failing-report test pins.
    expected_threshold_rows = (
        "| 10 | 20.0 |",
        "| 25 | 60.0 |",
        "| 50 | 78.0 |",
        "| 100 | 92.0 |",
    )
    for row in expected_threshold_rows:
        assert row in text
|
||
|
||
|
||
def test_render_report_marks_failure_when_below_gate() -> None:
    """A report rendered with ``passed=False`` shows FAIL and the 100 m share."""
    # Arrange
    failing = _failing_distribution()

    # Act
    text = render_report(failing, _context(), passed=False)

    # Assert: the 100 m threshold row carries the fixture's 45 % share.
    assert "**Verdict**: FAIL" in text
    assert "| 100 | 45.0 |" in text
|
||
|
||
|
||
def test_render_report_includes_vertical_when_available() -> None:
    """With vertical samples present the report gains a vertical-error table."""
    # Arrange: ten samples on both axes; horizontal stats are uniform so
    # only the vertical values are distinctive in the rendered text.
    all_within = {10.0: 1.0, 25.0: 1.0, 50.0: 1.0, 100.0: 1.0}
    with_vertical = HorizontalErrorDistribution(
        count=10,
        horizontal_error_mean_m=10.0,
        horizontal_error_p50_m=10.0,
        horizontal_error_p95_m=10.0,
        horizontal_error_p99_m=10.0,
        threshold_hit_share=all_within,
        vertical_count=10,
        vertical_error_mean_m=4.5,
        vertical_error_p50_m=4.0,
        vertical_error_p95_m=8.5,
    )

    # Act
    text = render_report(with_vertical, _context(), passed=True)

    # Assert: heading first, then the rows rendered from the vertical fields.
    assert "## Vertical error (metres)" in text
    expected_rows = ("| Mean | 4.50 |", "| p95 | 8.50 |", "| Samples | 10 |")
    for row in expected_rows:
        assert row in text
|
||
|
||
|
||
def test_render_report_marks_vertical_skipped_when_no_samples() -> None:
    """Without vertical samples the report prints the skipped-altitude note."""
    # Arrange: the passing fixture has vertical_count == 0.
    without_vertical = _passing_distribution()
    context = _context()

    # Act
    text = render_report(without_vertical, context, passed=True)

    # Assert
    assert "_No emissions carried a comparable altitude" in text
|