"""AZ-699 — Report writer + accuracy-distribution unit tests. Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py`` and is skipped without the real video + ``RUN_REPLAY_E2E=1``. Style: every test follows the Arrange / Act / Assert pattern. """ from __future__ import annotations from pathlib import Path import pytest from gps_denied_onboard.helpers.gps_compare import ( GroundTruthRow, HorizontalErrorDistribution, horizontal_error_distribution, percentile_sorted, ) from tests.e2e.replay._report_writer import ( AC3_GATE_PCT, AC3_GATE_THRESHOLD_M, ReportContext, format_failure_message, render_report, verdict_passes_ac3, ) # --------------------------------------------------------------------- # percentile_sorted: pin numpy-equivalent linear interpolation def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None: # Arrange values = [1.0, 2.0, 3.0, 4.0, 5.0] # Act / Assert assert percentile_sorted(values, 0.0) == 1.0 assert percentile_sorted(values, 100.0) == 5.0 assert percentile_sorted(values, 50.0) == 3.0 def test_percentile_sorted_p95_linear_interpolation() -> None: # Arrange values = list(range(0, 101)) # 0..100 # Act p95 = percentile_sorted(values, 95.0) # Assert: numpy's linear interpolation puts p95 at index 95 of # [0..100] → value = 95. assert p95 == pytest.approx(95.0, abs=1e-9) def test_percentile_sorted_empty_returns_zero() -> None: # Act / Assert assert percentile_sorted([], 50.0) == 0.0 # --------------------------------------------------------------------- # horizontal_error_distribution: walk emissions once, return aggregate def test_distribution_zero_error_at_collocated_points() -> None: # Arrange: estimator emissions perfectly match ground-truth rows. gt = [ GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0), GroundTruthRow(t_s=1.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0), ] emissions = [ { "emitted_at": 0, "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0}, }, { "emitted_at": 1_000_000_000, "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0}, }, ] # Act dist = horizontal_error_distribution(emissions, gt) # Assert assert dist.count == 2 assert dist.horizontal_error_mean_m == pytest.approx(0.0, abs=1e-6) assert dist.horizontal_error_p99_m == pytest.approx(0.0, abs=1e-6) for threshold in (10.0, 25.0, 50.0, 100.0): assert dist.threshold_hit_share[threshold] == pytest.approx(1.0) def test_distribution_threshold_hit_share_partials() -> None: # Arrange: 4 emissions at increasing offsets from a single GT # point. ~111 m per 0.001 deg lat near the equator → produce # one each in the < 10, < 25, < 50, < 100 buckets so the # threshold counters are unambiguous. gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)] # Distances are dominated by latitude offsets. Use deltas that # bracket the buckets (5 m, 20 m, 40 m, 80 m). emissions = [ { "emitted_at": i, "position_wgs84": { "lat_deg": 50.0 + (offset_m / 111_000.0), "lon_deg": 30.0, "alt_m": 100.0, }, } for i, offset_m in enumerate((5.0, 20.0, 40.0, 80.0)) ] # Act dist = horizontal_error_distribution(emissions, gt) # Assert assert dist.count == 4 assert dist.threshold_hit_share[10.0] == pytest.approx(0.25) # only 5 m assert dist.threshold_hit_share[25.0] == pytest.approx(0.50) # + 20 m assert dist.threshold_hit_share[50.0] == pytest.approx(0.75) # + 40 m assert dist.threshold_hit_share[100.0] == pytest.approx(1.0) # + 80 m def test_distribution_vertical_skipped_when_alt_absent() -> None: # Arrange: emissions without ``alt_m`` produce zero vertical # samples; horizontal stats still compute. gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)] emissions = [ { "emitted_at": 0, "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0}, } ] # Act dist = horizontal_error_distribution(emissions, gt) # Assert assert dist.vertical_count == 0 assert dist.vertical_error_mean_m == 0.0 assert dist.count == 1 def test_distribution_empty_emissions_returns_zero_dist() -> None: # Arrange gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)] # Act dist = horizontal_error_distribution([], gt) # Assert assert dist.count == 0 assert dist.horizontal_error_mean_m == 0.0 for share in dist.threshold_hit_share.values(): assert share == 0.0 def test_distribution_raises_on_empty_ground_truth() -> None: # Arrange emissions = [ { "emitted_at": 0, "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0}, } ] # Act / Assert with pytest.raises(AssertionError): horizontal_error_distribution(emissions, []) # --------------------------------------------------------------------- # Verdict + failure-message: AC-3 honesty def _passing_distribution() -> HorizontalErrorDistribution: return HorizontalErrorDistribution( count=100, horizontal_error_mean_m=20.0, horizontal_error_p50_m=18.0, horizontal_error_p95_m=70.0, horizontal_error_p99_m=95.0, threshold_hit_share={10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92}, vertical_count=0, vertical_error_mean_m=0.0, vertical_error_p50_m=0.0, vertical_error_p95_m=0.0, ) def _failing_distribution() -> HorizontalErrorDistribution: return HorizontalErrorDistribution( count=200, horizontal_error_mean_m=180.0, horizontal_error_p50_m=140.0, horizontal_error_p95_m=420.0, horizontal_error_p99_m=580.0, threshold_hit_share={10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45}, vertical_count=0, vertical_error_mean_m=0.0, vertical_error_p50_m=0.0, vertical_error_p95_m=0.0, ) def _context(method: str = "factory-sheet") -> ReportContext: return ReportContext( run_date_utc="2026-05-20", tlog_path=Path("/tmp/derkachi.tlog"), video_path=Path("/tmp/flight_derkachi.mp4"), calibration_acquisition_method=method, clip_duration_s=180.0, emissions_count=200, ) def test_verdict_passes_when_at_least_80pct_within_100m() -> None: # Act / Assert assert verdict_passes_ac3(_passing_distribution()) is True assert verdict_passes_ac3(_failing_distribution()) is False def test_verdict_fails_on_zero_count() -> None: # Arrange empty = HorizontalErrorDistribution( count=0, horizontal_error_mean_m=0.0, horizontal_error_p50_m=0.0, horizontal_error_p95_m=0.0, horizontal_error_p99_m=0.0, threshold_hit_share={100.0: 0.0}, vertical_count=0, vertical_error_mean_m=0.0, vertical_error_p50_m=0.0, vertical_error_p95_m=0.0, ) # Act / Assert assert verdict_passes_ac3(empty) is False def test_failure_message_references_calibration_method_factory_sheet() -> None: # Act msg = format_failure_message( _failing_distribution(), _context(method="factory-sheet"), ) # Assert: AZ-699 AC-3 — failure message attributes residual to the # calibration acquisition method and surfaces the budget. assert "AZ-699 AC-3" in msg assert "45.0 %" in msg # threshold-hit share at 100 m × 100 = 45.0 assert "100 m" in msg assert f"{AC3_GATE_PCT:.0f} %" in msg assert "Calibration: factory-sheet" in msg assert "mean=180.0 m" in msg assert "p95=420.0 m" in msg assert "p99=580.0 m" in msg def test_failure_message_references_calibration_method_placeholder() -> None: # Act msg = format_failure_message( _failing_distribution(), _context(method="placeholder"), ) # Assert assert "Calibration: placeholder" in msg # --------------------------------------------------------------------- # Render report: AC-2 — every required row present def test_render_report_contains_all_required_rows_on_pass() -> None: # Act text = render_report( _passing_distribution(), _context(), passed=True ) # Assert assert text.startswith("# Real-flight validation — 2026-05-20") assert "**Verdict**: PASS" in text assert "## Horizontal error (metres)" in text assert "## Threshold-hit share" in text assert "Calibration acquisition method: factory-sheet" in text for stat in ("Mean", "p50", "p95", "p99"): assert f"| {stat} |" in text for threshold in (10, 25, 50, 100): assert f"| {threshold} |" in text def test_render_report_marks_failure_when_below_gate() -> None: # Act text = render_report( _failing_distribution(), _context(), passed=False ) # Assert assert "**Verdict**: FAIL" in text # Threshold row for 100 m must show ~ 45 % (failing distribution). assert "| 100 | 45.0 |" in text def test_render_report_includes_vertical_when_available() -> None: # Arrange dist = HorizontalErrorDistribution( count=10, horizontal_error_mean_m=10.0, horizontal_error_p50_m=10.0, horizontal_error_p95_m=10.0, horizontal_error_p99_m=10.0, threshold_hit_share={10.0: 1.0, 25.0: 1.0, 50.0: 1.0, 100.0: 1.0}, vertical_count=10, vertical_error_mean_m=4.5, vertical_error_p50_m=4.0, vertical_error_p95_m=8.5, ) # Act text = render_report(dist, _context(), passed=True) # Assert assert "## Vertical error (metres)" in text assert "| Mean | 4.50 |" in text assert "| p95 | 8.50 |" in text assert "| Samples | 10 |" in text def test_render_report_marks_vertical_skipped_when_no_samples() -> None: # Arrange dist = _passing_distribution() # vertical_count == 0 # Act text = render_report(dist, _context(), passed=True) # Assert assert "_No emissions carried a comparable altitude" in text