Files
gps-denied-onboard/tests/unit/test_az699_report_writer.py
T
Oleksandr Bezdieniezhnykh dcde602f61 [AZ-699] Real-flight validation runner + Markdown accuracy report
New e2e test runs gps-denied-replay --auto-trim against the real
derkachi.tlog + flight video + AZ-702 calibration, computes the
horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m
threshold-hit share), writes
_docs/06_metrics/real_flight_validation_{date}.md, and asserts
honest PASS/FAIL with no @xfail
mask. AZ-404's 1-min test is untouched (sibling, not replacement).

Extends gps_compare.py with HorizontalErrorDistribution +
percentile_sorted (numpy-equivalent linear interpolation). New
test helper _report_writer.py renders the canonical Markdown
schema documented as FT-P-20 in blackbox-tests.md.

16 new unit tests pin distribution arithmetic, verdict gate,
failure-message templating (references calibration acquisition
method per AC-3), and report layout. 129 passed in focused
regression, 3 skipped (real video / Tier-2 prerequisites).
Zero new mypy --strict errors.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-20 16:53:48 +03:00

343 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""AZ-699 — Report writer + accuracy-distribution unit tests.
Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message
templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi
inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py``
and is skipped without the real video + ``RUN_REPLAY_E2E=1``.
Style: every test follows the Arrange / Act / Assert pattern.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
HorizontalErrorDistribution,
horizontal_error_distribution,
percentile_sorted,
)
from tests.e2e.replay._report_writer import (
AC3_GATE_PCT,
AC3_GATE_THRESHOLD_M,
ReportContext,
format_failure_message,
render_report,
verdict_passes_ac3,
)
# ---------------------------------------------------------------------
# percentile_sorted: pin numpy-equivalent linear interpolation
def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None:
    """Pin the 0th, 100th and 50th percentiles of a five-element ladder."""
    # Arrange
    ladder = [1.0, 2.0, 3.0, 4.0, 5.0]
    expected_by_pct = ((0.0, 1.0), (100.0, 5.0), (50.0, 3.0))

    # Act / Assert: the extremes map to min / max, the midpoint to the
    # middle element of the odd-length input.
    for pct, expected in expected_by_pct:
        assert percentile_sorted(ladder, pct) == expected
def test_percentile_sorted_p95_linear_interpolation() -> None:
    """Pin p95 both on an exact index and between two samples.

    On ``[0..100]`` the p95 rank is exactly index 95, so that case alone
    never exercises interpolation. The two-point case forces a
    fractional rank (0.95), which is what actually distinguishes linear
    interpolation from nearest-rank or lower/higher selection. Expected
    values follow numpy's default ``linear`` method, which is the
    contract this module states it pins for ``percentile_sorted``.
    """
    # Arrange
    values = list(range(0, 101))  # 0..100
    two_point = [10.0, 20.0]

    # Act
    p95 = percentile_sorted(values, 95.0)
    p95_between = percentile_sorted(two_point, 95.0)

    # Assert: numpy's linear interpolation puts p95 at index 95 of
    # [0..100] → value = 95.
    assert p95 == pytest.approx(95.0, abs=1e-9)
    # rank = 0.95 * (2 - 1) = 0.95 → 10 + 0.95 * (20 - 10) = 19.5.
    assert p95_between == pytest.approx(19.5, abs=1e-9)
def test_percentile_sorted_empty_returns_zero() -> None:
    """An empty sample list yields 0.0 for the requested percentile."""
    # Act
    median_of_nothing = percentile_sorted([], 50.0)

    # Assert
    assert median_of_nothing == 0.0
# ---------------------------------------------------------------------
# horizontal_error_distribution: walk emissions once, return aggregate
def test_distribution_zero_error_at_collocated_points() -> None:
    """Emissions that coincide with ground truth give an all-zero error profile."""
    # Arrange: two estimator fixes sitting exactly on their
    # ground-truth rows.
    lat_deg, lon_deg, alt_m = 50.0, 30.0, 100.0
    truth_rows = [
        GroundTruthRow(t_s=t_s, lat_deg=lat_deg, lon_deg=lon_deg, alt_m=alt_m)
        for t_s in (0.0, 1.0)
    ]
    fixes = [
        {
            "emitted_at": emitted_at,
            "position_wgs84": {
                "lat_deg": lat_deg,
                "lon_deg": lon_deg,
                "alt_m": alt_m,
            },
        }
        # NOTE(review): presumably nanoseconds, lining up with
        # t_s = 0.0 and 1.0 above — confirm against gps_compare.
        for emitted_at in (0, 1_000_000_000)
    ]

    # Act
    result = horizontal_error_distribution(fixes, truth_rows)

    # Assert: both samples counted, no residual error, every threshold
    # bucket fully populated.
    assert result.count == 2
    assert result.horizontal_error_mean_m == pytest.approx(0.0, abs=1e-6)
    assert result.horizontal_error_p99_m == pytest.approx(0.0, abs=1e-6)
    for limit_m in (10.0, 25.0, 50.0, 100.0):
        assert result.threshold_hit_share[limit_m] == pytest.approx(1.0)
def test_distribution_threshold_hit_share_partials() -> None:
    """Each threshold's hit share grows by one emission out of four."""
    # Arrange: four emissions displaced northwards from a single
    # ground-truth point. The offsets (5 m, 20 m, 40 m, 80 m) are
    # converted to degrees with ~111 km per degree of latitude, so one
    # emission falls under each of the 10 / 25 / 50 / 100 m thresholds
    # and the cumulative shares are unambiguous.
    metres_per_deg_lat = 111_000.0
    offsets_m = (5.0, 20.0, 40.0, 80.0)
    truth_rows = [
        GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0),
    ]
    fixes = [
        {
            "emitted_at": tick,
            "position_wgs84": {
                "lat_deg": 50.0 + (north_m / metres_per_deg_lat),
                "lon_deg": 30.0,
                "alt_m": 100.0,
            },
        }
        for tick, north_m in enumerate(offsets_m)
    ]
    expected_share = {
        10.0: 0.25,  # only 5 m
        25.0: 0.50,  # + 20 m
        50.0: 0.75,  # + 40 m
        100.0: 1.0,  # + 80 m
    }

    # Act
    result = horizontal_error_distribution(fixes, truth_rows)

    # Assert
    assert result.count == 4
    for limit_m, share in expected_share.items():
        assert result.threshold_hit_share[limit_m] == pytest.approx(share)
def test_distribution_vertical_skipped_when_alt_absent() -> None:
    """Without ``alt_m`` on the emission, vertical stats stay at zero."""
    # Arrange: the emission carries latitude / longitude only, so there
    # is nothing to compare against the ground-truth altitude.
    truth_rows = [
        GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0),
    ]
    fix_without_alt = {
        "emitted_at": 0,
        "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0},
    }

    # Act
    result = horizontal_error_distribution([fix_without_alt], truth_rows)

    # Assert: no vertical samples, while the horizontal sample is
    # still counted.
    assert result.vertical_count == 0
    assert result.vertical_error_mean_m == 0.0
    assert result.count == 1
def test_distribution_empty_emissions_returns_zero_dist() -> None:
    """No emissions yields a zero-count distribution with zero shares."""
    # Arrange
    truth_rows = [
        GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0),
    ]

    # Act
    result = horizontal_error_distribution([], truth_rows)

    # Assert
    assert result.count == 0
    assert result.horizontal_error_mean_m == 0.0
    assert all(share == 0.0 for share in result.threshold_hit_share.values())
def test_distribution_raises_on_empty_ground_truth() -> None:
    """An empty ground-truth list is rejected with ``AssertionError``."""
    # Arrange
    lone_fix = {
        "emitted_at": 0,
        "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0},
    }
    no_truth_rows: list[GroundTruthRow] = []

    # Act / Assert
    with pytest.raises(AssertionError):
        horizontal_error_distribution([lone_fix], no_truth_rows)
# ---------------------------------------------------------------------
# Verdict + failure-message: AC-3 honesty
def _passing_distribution() -> HorizontalErrorDistribution:
    """Build a 100-sample fixture with 92 % of emissions within 100 m.

    The verdict test in this module expects ``verdict_passes_ac3`` to
    accept it. Vertical statistics are left empty.
    """
    hit_share = {10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92}
    fixture = HorizontalErrorDistribution(
        count=100,
        horizontal_error_mean_m=20.0,
        horizontal_error_p50_m=18.0,
        horizontal_error_p95_m=70.0,
        horizontal_error_p99_m=95.0,
        threshold_hit_share=hit_share,
        vertical_count=0,
        vertical_error_mean_m=0.0,
        vertical_error_p50_m=0.0,
        vertical_error_p95_m=0.0,
    )
    return fixture
def _failing_distribution() -> HorizontalErrorDistribution:
    """Build a 200-sample fixture with only 45 % of emissions within 100 m.

    The verdict test in this module expects ``verdict_passes_ac3`` to
    reject it. Vertical statistics are left empty.
    """
    hit_share = {10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45}
    fixture = HorizontalErrorDistribution(
        count=200,
        horizontal_error_mean_m=180.0,
        horizontal_error_p50_m=140.0,
        horizontal_error_p95_m=420.0,
        horizontal_error_p99_m=580.0,
        threshold_hit_share=hit_share,
        vertical_count=0,
        vertical_error_mean_m=0.0,
        vertical_error_p50_m=0.0,
        vertical_error_p95_m=0.0,
    )
    return fixture
def _context(method: str = "factory-sheet") -> ReportContext:
    """Build a fixed ``ReportContext``, varying only the calibration method.

    Args:
        method: Value stored as ``calibration_acquisition_method``.
    """
    tmp_dir = Path("/tmp")
    return ReportContext(
        run_date_utc="2026-05-20",
        tlog_path=tmp_dir / "derkachi.tlog",
        video_path=tmp_dir / "flight_derkachi.mp4",
        calibration_acquisition_method=method,
        clip_duration_s=180.0,
        emissions_count=200,
    )
def test_verdict_passes_when_at_least_80pct_within_100m() -> None:
    """The AC-3 verdict accepts the passing fixture and rejects the failing one."""
    # Act / Assert: 92 % within 100 m passes.
    passing_verdict = verdict_passes_ac3(_passing_distribution())
    assert passing_verdict is True

    # Act / Assert: 45 % within 100 m does not.
    failing_verdict = verdict_passes_ac3(_failing_distribution())
    assert failing_verdict is False
def test_verdict_fails_on_zero_count() -> None:
    """A distribution with no samples never passes the AC-3 verdict."""
    # Arrange: every statistic zeroed, with a single 100 m share entry.
    no_samples = HorizontalErrorDistribution(
        count=0,
        horizontal_error_mean_m=0.0,
        horizontal_error_p50_m=0.0,
        horizontal_error_p95_m=0.0,
        horizontal_error_p99_m=0.0,
        threshold_hit_share={100.0: 0.0},
        vertical_count=0,
        vertical_error_mean_m=0.0,
        vertical_error_p50_m=0.0,
        vertical_error_p95_m=0.0,
    )

    # Act
    verdict = verdict_passes_ac3(no_samples)

    # Assert
    assert verdict is False
def test_failure_message_references_calibration_method_factory_sheet() -> None:
    """The failure message names the calibration method and the AC-3 budget.

    Both gate figures are derived from the imported ``AC3_GATE_*``
    constants rather than hard-coded, so the threshold assertion stays
    in step with the gate definition the same way the percentage
    assertion does.
    """
    # Arrange
    expected_fragments = (
        "AZ-699 AC-3",
        # Threshold-hit share at 100 m × 100 = 45.0 for the failing fixture.
        "45.0 %",
        f"{AC3_GATE_THRESHOLD_M:.0f} m",
        f"{AC3_GATE_PCT:.0f} %",
        "Calibration: factory-sheet",
        "mean=180.0 m",
        "p95=420.0 m",
        "p99=580.0 m",
    )

    # Act
    msg = format_failure_message(
        _failing_distribution(),
        _context(method="factory-sheet"),
    )

    # Assert: AZ-699 AC-3 — failure message attributes residual to the
    # calibration acquisition method and surfaces the budget.
    for fragment in expected_fragments:
        assert fragment in msg
def test_failure_message_references_calibration_method_placeholder() -> None:
    """The failure message echoes a ``placeholder`` calibration method."""
    # Arrange
    dist = _failing_distribution()
    ctx = _context(method="placeholder")

    # Act
    message = format_failure_message(dist, ctx)

    # Assert
    assert "Calibration: placeholder" in message
# ---------------------------------------------------------------------
# Render report: AC-2 — every required row present
def test_render_report_contains_all_required_rows_on_pass() -> None:
    """A passing report carries the title, verdict, sections and table rows."""
    # Arrange
    required_fragments = [
        "**Verdict**: PASS",
        "## Horizontal error (metres)",
        "## Threshold-hit share",
        "Calibration acquisition method: factory-sheet",
    ]
    # One table row per horizontal statistic, then one per threshold.
    required_fragments += [f"| {stat} |" for stat in ("Mean", "p50", "p95", "p99")]
    required_fragments += [f"| {limit} |" for limit in (10, 25, 50, 100)]

    # Act
    report = render_report(_passing_distribution(), _context(), passed=True)

    # Assert
    assert report.startswith("# Real-flight validation — 2026-05-20")
    for fragment in required_fragments:
        assert fragment in report
def test_render_report_marks_failure_when_below_gate() -> None:
    """A failing report shows the FAIL verdict and the 45 % row at 100 m."""
    # Arrange
    dist = _failing_distribution()
    ctx = _context()

    # Act
    report = render_report(dist, ctx, passed=False)

    # Assert
    assert "**Verdict**: FAIL" in report
    # Threshold row for 100 m must show ~ 45 % (failing distribution).
    assert "| 100 | 45.0 |" in report
def test_render_report_includes_vertical_when_available() -> None:
    """With vertical samples present, the report renders the vertical table."""
    # Arrange: ten samples, all horizontal statistics flat at 10 m and
    # every threshold fully hit; the vertical statistics are populated.
    flat_10m = 10.0
    with_vertical = HorizontalErrorDistribution(
        count=10,
        horizontal_error_mean_m=flat_10m,
        horizontal_error_p50_m=flat_10m,
        horizontal_error_p95_m=flat_10m,
        horizontal_error_p99_m=flat_10m,
        threshold_hit_share={10.0: 1.0, 25.0: 1.0, 50.0: 1.0, 100.0: 1.0},
        vertical_count=10,
        vertical_error_mean_m=4.5,
        vertical_error_p50_m=4.0,
        vertical_error_p95_m=8.5,
    )

    # Act
    report = render_report(with_vertical, _context(), passed=True)

    # Assert
    expected_fragments = (
        "## Vertical error (metres)",
        "| Mean | 4.50 |",
        "| p95 | 8.50 |",
        "| Samples | 10 |",
    )
    for fragment in expected_fragments:
        assert fragment in report
def test_render_report_marks_vertical_skipped_when_no_samples() -> None:
    """With no vertical samples, the report carries the skipped-altitude note."""
    # Arrange: the passing fixture has vertical_count == 0.
    without_vertical = _passing_distribution()
    ctx = _context()

    # Act
    report = render_report(without_vertical, ctx, passed=True)

    # Assert
    assert "_No emissions carried a comparable altitude" in report