mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 10:21:13 +00:00
a3dc8e2636
ReportContext.tlog_path was widened in-place by AZ-959 to mean "ground-truth source path" without renaming, leaving the rendered report's "- Tlog: <csv_path>" line cosmetically wrong for CSV runs. This rename + label fix completes the cleanup. - helpers/accuracy_report.py: field rename + docstring update + rendered line now reads "- Ground truth: <path>" for both inputs. - replay_api/app.py: kwarg updated, AZ-959 inline comment about the overload removed (field name now carries the intent). - tests/unit/test_az699_report_writer.py: fixture updated, two new symmetric tests assert the canonical label for tlog AND csv inputs (AC-2). - tests/e2e/replay/_e2e_orchestrator.py + test_derkachi_real_tlog.py: kwarg updated. Tests: 62/62 green across test_az699_report_writer.py, test_az700_render_map.py, test_az701_replay_api.py. CSV-replay-input chain (AZ-959 + AZ-960 + AZ-961) is now coherent: - API accepts (video, csv) with XOR validation - /static/example-csv serves the AZ-896 reference doc - Runner dispatches --imu vs --tlog argv - Report renders with source-agnostic "Ground truth:" label - Map renders from CSV truth via gps-denied-render-map dispatch Bookkeeping: AZ-961 spec moved todo/ → done/, dep-table preamble eighth bump documents the rename + summarises the cycle-4 CSV chain, state.md records batch 7 complete. Co-authored-by: Cursor <cursoragent@cursor.com>
374 lines
11 KiB
Python
374 lines
11 KiB
Python
"""AZ-699 — Report writer + accuracy-distribution unit tests.
|
||
|
||
Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message
|
||
templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi
|
||
inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py``
|
||
and is skipped without the real video + ``RUN_REPLAY_E2E=1``.
|
||
|
||
Style: every test follows the Arrange / Act / Assert pattern.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
from gps_denied_onboard.helpers.gps_compare import (
|
||
GroundTruthRow,
|
||
HorizontalErrorDistribution,
|
||
horizontal_error_distribution,
|
||
percentile_sorted,
|
||
)
|
||
from gps_denied_onboard.helpers.accuracy_report import (
|
||
AC3_GATE_PCT,
|
||
AC3_GATE_THRESHOLD_M,
|
||
ReportContext,
|
||
format_failure_message,
|
||
render_report,
|
||
verdict_passes_ac3,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# percentile_sorted: pin numpy-equivalent linear interpolation
|
||
|
||
|
||
def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None:
    """Percentiles 0, 100 and 50 of a five-element sample hit exact elements."""
    # Arrange
    samples = [1.0, 2.0, 3.0, 4.0, 5.0]
    expected_by_pct = (
        (0.0, 1.0),
        (100.0, 5.0),
        (50.0, 3.0),
    )

    # Act / Assert
    for pct, expected in expected_by_pct:
        assert percentile_sorted(samples, pct) == expected
|
||
|
||
|
||
def test_percentile_sorted_p95_linear_interpolation() -> None:
    """p95 over the integers 0..100 must come out as 95."""
    # Arrange
    samples = [*range(101)]  # 0..100 inclusive

    # Act
    result = percentile_sorted(samples, 95.0)

    # Assert: with numpy-style linear interpolation the 95th percentile
    # of [0..100] sits exactly on index 95, i.e. the value 95.
    assert result == pytest.approx(95.0, abs=1e-9)
|
||
|
||
|
||
def test_percentile_sorted_empty_returns_zero() -> None:
    """An empty sample yields 0.0 rather than raising."""
    # Arrange
    no_samples = []

    # Act / Assert
    assert percentile_sorted(no_samples, 50.0) == 0.0
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# horizontal_error_distribution: walk emissions once, return aggregate
|
||
|
||
|
||
def test_distribution_zero_error_at_collocated_points() -> None:
    """Emissions that coincide with ground truth give zero error everywhere."""
    # Arrange: every estimator emission sits exactly on a ground-truth row.
    position = {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0}
    truth_rows = [GroundTruthRow(t_s=t_s, **position) for t_s in (0.0, 1.0)]
    estimates = [
        {"emitted_at": stamp, "position_wgs84": dict(position)}
        for stamp in (0, 1_000_000_000)
    ]

    # Act
    result = horizontal_error_distribution(estimates, truth_rows)

    # Assert
    assert result.count == 2
    assert result.horizontal_error_mean_m == pytest.approx(0.0, abs=1e-6)
    assert result.horizontal_error_p99_m == pytest.approx(0.0, abs=1e-6)
    for limit_m in (10.0, 25.0, 50.0, 100.0):
        assert result.threshold_hit_share[limit_m] == pytest.approx(1.0)
|
||
|
||
|
||
def test_distribution_threshold_hit_share_partials() -> None:
    """Each threshold bucket gains exactly one more emission than the last."""
    # Arrange: a single ground-truth point and four emissions displaced
    # northwards by 5 m, 20 m, 40 m and 80 m. One degree of latitude is
    # taken as ~111 km, so each offset falls unambiguously inside the
    # next bucket (< 10, < 25, < 50, < 100 m).
    truth_rows = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    estimates = []
    for seq, offset_m in enumerate((5.0, 20.0, 40.0, 80.0)):
        shifted = {
            "lat_deg": 50.0 + (offset_m / 111_000.0),
            "lon_deg": 30.0,
            "alt_m": 100.0,
        }
        estimates.append({"emitted_at": seq, "position_wgs84": shifted})

    # Act
    result = horizontal_error_distribution(estimates, truth_rows)

    # Assert
    assert result.count == 4
    expected_share = {
        10.0: 0.25,  # only 5 m
        25.0: 0.50,  # + 20 m
        50.0: 0.75,  # + 40 m
        100.0: 1.0,  # + 80 m
    }
    for limit_m, share in expected_share.items():
        assert result.threshold_hit_share[limit_m] == pytest.approx(share)
|
||
|
||
|
||
def test_distribution_vertical_skipped_when_alt_absent() -> None:
    """Emissions lacking ``alt_m`` contribute no vertical samples."""
    # Arrange: the emission carries no altitude, so only the horizontal
    # statistics can be computed for it.
    truth_rows = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    altitude_free = {"lat_deg": 50.0, "lon_deg": 30.0}
    estimates = [{"emitted_at": 0, "position_wgs84": altitude_free}]

    # Act
    result = horizontal_error_distribution(estimates, truth_rows)

    # Assert
    assert result.vertical_count == 0
    assert result.vertical_error_mean_m == 0.0
    assert result.count == 1
|
||
|
||
|
||
def test_distribution_empty_emissions_returns_zero_dist() -> None:
    """No emissions produce an all-zero distribution instead of an error."""
    # Arrange
    truth_rows = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    no_estimates = []

    # Act
    result = horizontal_error_distribution(no_estimates, truth_rows)

    # Assert
    assert result.count == 0
    assert result.horizontal_error_mean_m == 0.0
    for hit_share in result.threshold_hit_share.values():
        assert hit_share == 0.0
|
||
|
||
|
||
def test_distribution_raises_on_empty_ground_truth() -> None:
    """An empty ground-truth list is rejected with ``AssertionError``."""
    # Arrange
    lone_estimate = {
        "emitted_at": 0,
        "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0},
    }
    estimates = [lone_estimate]
    no_truth = []

    # Act / Assert
    with pytest.raises(AssertionError):
        horizontal_error_distribution(estimates, no_truth)
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# Verdict + failure-message: AC-3 honesty
|
||
|
||
|
||
def _passing_distribution() -> HorizontalErrorDistribution:
    """Build a fixture distribution with 92 % of samples within 100 m.

    The verdict tests in this module expect it to pass the AC-3 gate.
    It carries no vertical samples.
    """
    hit_share = {10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92}
    horizontal = {
        "horizontal_error_mean_m": 20.0,
        "horizontal_error_p50_m": 18.0,
        "horizontal_error_p95_m": 70.0,
        "horizontal_error_p99_m": 95.0,
    }
    vertical = {
        "vertical_count": 0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    return HorizontalErrorDistribution(
        count=100,
        threshold_hit_share=hit_share,
        **horizontal,
        **vertical,
    )
|
||
|
||
|
||
def _failing_distribution() -> HorizontalErrorDistribution:
    """Build a fixture distribution with only 45 % of samples within 100 m.

    The verdict tests in this module expect it to fail the AC-3 gate.
    It carries no vertical samples.
    """
    hit_share = {10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45}
    horizontal = {
        "horizontal_error_mean_m": 180.0,
        "horizontal_error_p50_m": 140.0,
        "horizontal_error_p95_m": 420.0,
        "horizontal_error_p99_m": 580.0,
    }
    vertical = {
        "vertical_count": 0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    return HorizontalErrorDistribution(
        count=200,
        threshold_hit_share=hit_share,
        **horizontal,
        **vertical,
    )
|
||
|
||
|
||
def _context(method: str = "factory-sheet") -> ReportContext:
    """Build the tlog-backed ``ReportContext`` shared by the tests below.

    ``method`` is forwarded as the calibration acquisition method; every
    other field is a fixed fixture value.
    """
    truth_source = Path("/tmp/derkachi.tlog")
    video_source = Path("/tmp/flight_derkachi.mp4")
    return ReportContext(
        run_date_utc="2026-05-20",
        ground_truth_path=truth_source,
        video_path=video_source,
        calibration_acquisition_method=method,
        clip_duration_s=180.0,
        emissions_count=200,
    )
|
||
|
||
|
||
def test_verdict_passes_when_at_least_80pct_within_100m() -> None:
    """The AC-3 verdict accepts the passing fixture and rejects the failing one."""
    # Act / Assert
    passing_verdict = verdict_passes_ac3(_passing_distribution())
    assert passing_verdict is True

    failing_verdict = verdict_passes_ac3(_failing_distribution())
    assert failing_verdict is False
|
||
|
||
|
||
def test_verdict_fails_on_zero_count() -> None:
    """A distribution with no samples can never pass the AC-3 gate."""
    # Arrange: every statistic is zero and there are no samples at all.
    zeroed_stats = dict.fromkeys(
        (
            "horizontal_error_mean_m",
            "horizontal_error_p50_m",
            "horizontal_error_p95_m",
            "horizontal_error_p99_m",
            "vertical_error_mean_m",
            "vertical_error_p50_m",
            "vertical_error_p95_m",
        ),
        0.0,
    )
    no_samples = HorizontalErrorDistribution(
        count=0,
        threshold_hit_share={100.0: 0.0},
        vertical_count=0,
        **zeroed_stats,
    )

    # Act / Assert
    assert verdict_passes_ac3(no_samples) is False
|
||
|
||
|
||
def test_failure_message_references_calibration_method_factory_sheet() -> None:
    """The failure message names the calibration method and the error budget."""
    # Arrange
    distribution = _failing_distribution()
    context = _context(method="factory-sheet")

    # Act
    message = format_failure_message(distribution, context)

    # Assert: AZ-699 AC-3 — the failure message attributes the residual
    # to the calibration acquisition method and surfaces the budget.
    expected_fragments = (
        "AZ-699 AC-3",
        "45.0 %",  # threshold-hit share at 100 m × 100 = 45.0
        "100 m",
        f"{AC3_GATE_PCT:.0f} %",
        "Calibration: factory-sheet",
        "mean=180.0 m",
        "p95=420.0 m",
        "p99=580.0 m",
    )
    for fragment in expected_fragments:
        assert fragment in message
|
||
|
||
|
||
def test_failure_message_references_calibration_method_placeholder() -> None:
    """The failure message echoes a ``placeholder`` calibration method."""
    # Arrange
    distribution = _failing_distribution()
    context = _context(method="placeholder")

    # Act
    message = format_failure_message(distribution, context)

    # Assert
    assert "Calibration: placeholder" in message
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
# Render report: AC-2 — every required row present
|
||
|
||
|
||
def test_render_report_contains_all_required_rows_on_pass() -> None:
    """A passing report carries the title, verdict, sections and table rows."""
    # Arrange
    distribution = _passing_distribution()
    context = _context()

    # Act
    report = render_report(distribution, context, passed=True)

    # Assert
    assert report.startswith("# Real-flight validation — 2026-05-20")
    required_fragments = [
        "**Verdict**: PASS",
        "## Horizontal error (metres)",
        "## Threshold-hit share",
        "Calibration acquisition method: factory-sheet",
    ]
    required_fragments += [f"| {stat} |" for stat in ("Mean", "p50", "p95", "p99")]
    required_fragments += [f"| {limit_m} |" for limit_m in (10, 25, 50, 100)]
    for fragment in required_fragments:
        assert fragment in report
|
||
|
||
|
||
def test_render_report_marks_failure_when_below_gate() -> None:
    """A failing run renders a FAIL verdict and the 45 % share at 100 m."""
    # Arrange
    distribution = _failing_distribution()
    context = _context()

    # Act
    report = render_report(distribution, context, passed=False)

    # Assert
    assert "**Verdict**: FAIL" in report
    # The 100 m threshold row must show the failing fixture's ~45 %.
    assert "| 100 | 45.0 |" in report
|
||
|
||
|
||
def test_render_report_includes_vertical_when_available() -> None:
    """Vertical statistics are rendered when vertical samples exist."""
    # Arrange: a uniform 10 m horizontal error plus ten vertical samples.
    everything_hits = dict.fromkeys((10.0, 25.0, 50.0, 100.0), 1.0)
    with_vertical = HorizontalErrorDistribution(
        count=10,
        horizontal_error_mean_m=10.0,
        horizontal_error_p50_m=10.0,
        horizontal_error_p95_m=10.0,
        horizontal_error_p99_m=10.0,
        threshold_hit_share=everything_hits,
        vertical_count=10,
        vertical_error_mean_m=4.5,
        vertical_error_p50_m=4.0,
        vertical_error_p95_m=8.5,
    )

    # Act
    report = render_report(with_vertical, _context(), passed=True)

    # Assert
    expected_fragments = (
        "## Vertical error (metres)",
        "| Mean | 4.50 |",
        "| p95 | 8.50 |",
        "| Samples | 10 |",
    )
    for fragment in expected_fragments:
        assert fragment in report
|
||
|
||
|
||
def test_render_report_marks_vertical_skipped_when_no_samples() -> None:
    """With zero vertical samples the report shows the skip notice."""
    # Arrange
    without_vertical = _passing_distribution()  # vertical_count == 0
    context = _context()

    # Act
    report = render_report(without_vertical, context, passed=True)

    # Assert
    skip_notice = "_No emissions carried a comparable altitude"
    assert skip_notice in report
|
||
|
||
|
||
def test_render_report_labels_ground_truth_source_agnostic_for_csv_path() -> None:
    """A CSV ground-truth source renders under the "Ground truth:" label."""
    # Arrange — AZ-961 AC-2: the rendered label reads "Ground truth: …"
    # for both tlog AND csv inputs, and never "Tlog: …".
    csv_source = Path("/tmp/data_imu.csv")
    video_source = Path("/tmp/nadir.mp4")
    csv_backed_context = ReportContext(
        run_date_utc="2026-05-29",
        ground_truth_path=csv_source,
        video_path=video_source,
        calibration_acquisition_method="factory-sheet",
        clip_duration_s=180.0,
        emissions_count=200,
    )

    # Act
    report = render_report(_passing_distribution(), csv_backed_context, passed=True)

    # Assert
    assert "- Ground truth: `/tmp/data_imu.csv`" in report
    assert "- Tlog:" not in report
|
||
|
||
|
||
def test_render_report_labels_ground_truth_source_agnostic_for_tlog_path() -> None:
    """A tlog ground-truth source renders under the "Ground truth:" label."""
    # Arrange — AZ-961 AC-2: tlog runs get the same canonical label.
    distribution = _passing_distribution()
    tlog_backed_context = _context()

    # Act
    report = render_report(distribution, tlog_backed_context, passed=True)

    # Assert
    assert "- Ground truth: `/tmp/derkachi.tlog`" in report
    assert "- Tlog:" not in report
|