Files
gps-denied-onboard/tests/unit/test_az699_report_writer.py
T
Oleksandr Bezdieniezhnykh a3dc8e2636 [AZ-961] accuracy_report: rename tlog_path -> ground_truth_path
ReportContext.tlog_path was widened in-place by AZ-959 to mean
"ground-truth source path" without renaming, leaving the rendered
report's "- Tlog: <csv_path>" line cosmetically wrong for CSV
runs. This rename + label fix completes the cleanup.

- helpers/accuracy_report.py: field rename + docstring update +
  rendered line now reads "- Ground truth: <path>" for both
  inputs.
- replay_api/app.py: kwarg updated, AZ-959 inline comment about
  the overload removed (field name now carries the intent).
- tests/unit/test_az699_report_writer.py: fixture updated, two
  new symmetric tests assert the canonical label for tlog AND
  csv inputs (AC-2).
- tests/e2e/replay/_e2e_orchestrator.py +
  test_derkachi_real_tlog.py: kwarg updated.

Tests: 62/62 green across test_az699_report_writer.py,
test_az700_render_map.py, test_az701_replay_api.py.

CSV-replay-input chain (AZ-959 + AZ-960 + AZ-961) is now coherent:
- API accepts (video, csv) with XOR validation
- /static/example-csv serves the AZ-896 reference doc
- Runner dispatches --imu vs --tlog argv
- Report renders with source-agnostic "Ground truth:" label
- Map renders from CSV truth via gps-denied-render-map dispatch

Bookkeeping: AZ-961 spec moved todo/ → done/; the eighth bump of the
dep-table preamble documents the rename and summarises the cycle-4 CSV
chain; state.md records batch 7 as complete.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-29 12:55:57 +03:00

374 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""AZ-699 — Report writer + accuracy-distribution unit tests.
Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message
templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi
inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py``
and is skipped without the real video + ``RUN_REPLAY_E2E=1``.
Style: every test follows the Arrange / Act / Assert pattern.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
HorizontalErrorDistribution,
horizontal_error_distribution,
percentile_sorted,
)
from gps_denied_onboard.helpers.accuracy_report import (
AC3_GATE_PCT,
AC3_GATE_THRESHOLD_M,
ReportContext,
format_failure_message,
render_report,
verdict_passes_ac3,
)
# ---------------------------------------------------------------------
# percentile_sorted: pin numpy-equivalent linear interpolation
def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None:
    """Percentiles 0, 100 and 50 of a five-element list are its min, max and middle."""
    # Arrange
    samples = [1.0, 2.0, 3.0, 4.0, 5.0]
    expected_by_percentile = ((0.0, 1.0), (100.0, 5.0), (50.0, 3.0))

    # Act / Assert: each percentile lands exactly on an element, so
    # plain equality is used rather than an approximate comparison.
    for pct, expected in expected_by_percentile:
        assert percentile_sorted(samples, pct) == expected
def test_percentile_sorted_p95_linear_interpolation() -> None:
    """The 95th percentile of the integers 0..100 is expected to be 95."""
    # Arrange: 101 consecutive integers, 0 through 100 inclusive.
    samples = [*range(101)]

    # Act
    result = percentile_sorted(samples, 95.0)

    # Assert: numpy-style linear interpolation places p95 at index 95
    # of [0..100], whose value is 95.
    assert result == pytest.approx(95.0, abs=1e-9)
def test_percentile_sorted_empty_returns_zero() -> None:
    """An empty sample list yields 0.0 for the requested percentile."""
    # Act
    result = percentile_sorted([], 50.0)

    # Assert
    assert result == 0.0
# ---------------------------------------------------------------------
# horizontal_error_distribution: walk emissions once, return aggregate
def test_distribution_zero_error_at_collocated_points() -> None:
    """Emissions that coincide with ground truth give an all-zero error profile."""
    # Arrange: two ground-truth rows and two emissions, all at the same
    # latitude / longitude / altitude.
    truth_rows = [
        GroundTruthRow(t_s=t_s, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)
        for t_s in (0.0, 1.0)
    ]
    emissions = [
        {
            "emitted_at": emitted_at,
            "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0},
        }
        for emitted_at in (0, 1_000_000_000)
    ]

    # Act
    result = horizontal_error_distribution(emissions, truth_rows)

    # Assert: both samples counted, no error, every threshold fully hit.
    assert result.count == 2
    assert result.horizontal_error_mean_m == pytest.approx(0.0, abs=1e-6)
    assert result.horizontal_error_p99_m == pytest.approx(0.0, abs=1e-6)
    for limit in (10.0, 25.0, 50.0, 100.0):
        assert result.threshold_hit_share[limit] == pytest.approx(1.0)
def test_distribution_threshold_hit_share_partials() -> None:
    """Four emissions at growing offsets each add a quarter to successive thresholds."""
    # Arrange: one ground-truth point and four emissions displaced
    # purely in latitude. Offsets in metres are converted to degrees by
    # dividing by 111_000, giving nominal distances of 5, 20, 40 and
    # 80 m, one inside each of the 10 / 25 / 50 / 100 thresholds.
    truth_rows = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    emissions = []
    for index, offset_m in enumerate((5.0, 20.0, 40.0, 80.0)):
        emissions.append(
            {
                "emitted_at": index,
                "position_wgs84": {
                    "lat_deg": 50.0 + (offset_m / 111_000.0),
                    "lon_deg": 30.0,
                    "alt_m": 100.0,
                },
            }
        )

    # Act
    result = horizontal_error_distribution(emissions, truth_rows)

    # Assert: each wider threshold admits exactly one more emission.
    assert result.count == 4
    expected_share_by_threshold = (
        (10.0, 0.25),  # only the 5 m emission
        (25.0, 0.50),  # plus the 20 m emission
        (50.0, 0.75),  # plus the 40 m emission
        (100.0, 1.0),  # plus the 80 m emission
    )
    for limit, expected_share in expected_share_by_threshold:
        assert result.threshold_hit_share[limit] == pytest.approx(expected_share)
def test_distribution_vertical_skipped_when_alt_absent() -> None:
    """An emission lacking ``alt_m`` adds a horizontal sample but no vertical one."""
    # Arrange: the single emission's position has no ``alt_m`` key.
    truth_rows = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    position_without_alt = {"lat_deg": 50.0, "lon_deg": 30.0}
    emissions = [{"emitted_at": 0, "position_wgs84": position_without_alt}]

    # Act
    result = horizontal_error_distribution(emissions, truth_rows)

    # Assert: vertical stats stay at zero; the horizontal sample counts.
    assert result.vertical_count == 0
    assert result.vertical_error_mean_m == 0.0
    assert result.count == 1
def test_distribution_empty_emissions_returns_zero_dist() -> None:
    """With no emissions the distribution has zero count, mean and hit shares."""
    # Arrange
    truth_rows = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    no_emissions: list = []

    # Act
    result = horizontal_error_distribution(no_emissions, truth_rows)

    # Assert
    assert result.count == 0
    assert result.horizontal_error_mean_m == 0.0
    for hit_share in result.threshold_hit_share.values():
        assert hit_share == 0.0
def test_distribution_raises_on_empty_ground_truth() -> None:
    """An empty ground-truth list is rejected with ``AssertionError``."""
    # Arrange: one emission, but no ground-truth rows to compare with.
    lone_emission = {
        "emitted_at": 0,
        "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0},
    }
    emissions = [lone_emission]
    empty_truth: list = []

    # Act / Assert
    with pytest.raises(AssertionError):
        horizontal_error_distribution(emissions, empty_truth)
# ---------------------------------------------------------------------
# Verdict + failure-message: AC-3 honesty
def _passing_distribution() -> HorizontalErrorDistribution:
    """Build the fixture used as the passing case: 100 samples, 0.92 share at 100 m.

    No vertical samples are recorded, so all vertical statistics are zero.
    """
    hit_share = {10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92}
    no_vertical = {
        "vertical_count": 0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    return HorizontalErrorDistribution(
        count=100,
        horizontal_error_mean_m=20.0,
        horizontal_error_p50_m=18.0,
        horizontal_error_p95_m=70.0,
        horizontal_error_p99_m=95.0,
        threshold_hit_share=hit_share,
        **no_vertical,
    )
def _failing_distribution() -> HorizontalErrorDistribution:
    """Build the fixture used as the failing case: 200 samples, 0.45 share at 100 m.

    No vertical samples are recorded, so all vertical statistics are zero.
    """
    hit_share = {10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45}
    no_vertical = {
        "vertical_count": 0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    return HorizontalErrorDistribution(
        count=200,
        horizontal_error_mean_m=180.0,
        horizontal_error_p50_m=140.0,
        horizontal_error_p95_m=420.0,
        horizontal_error_p99_m=580.0,
        threshold_hit_share=hit_share,
        **no_vertical,
    )
def _context(method: str = "factory-sheet") -> ReportContext:
    """Build a ``ReportContext`` for a tlog-backed run.

    Args:
        method: Value stored as ``calibration_acquisition_method``.

    Returns:
        A context whose ground-truth path is a ``.tlog`` file and whose
        other fields are fixed test values.
    """
    ground_truth = Path("/tmp/derkachi.tlog")
    video = Path("/tmp/flight_derkachi.mp4")
    return ReportContext(
        run_date_utc="2026-05-20",
        ground_truth_path=ground_truth,
        video_path=video,
        calibration_acquisition_method=method,
        clip_duration_s=180.0,
        emissions_count=200,
    )
def test_verdict_passes_when_at_least_80pct_within_100m() -> None:
    """The verdict is True for the passing fixture and False for the failing one."""
    # Arrange
    passing = _passing_distribution()
    failing = _failing_distribution()

    # Act / Assert: ``is`` pins the result to the bool singletons, not
    # merely to a truthy / falsy value.
    assert verdict_passes_ac3(passing) is True
    assert verdict_passes_ac3(failing) is False
def test_verdict_fails_on_zero_count() -> None:
    """A distribution with no samples never passes the AC-3 verdict."""
    # Arrange: every statistic is zero and only the 100 m threshold is
    # present, with a zero share.
    zero_stats = {
        "horizontal_error_mean_m": 0.0,
        "horizontal_error_p50_m": 0.0,
        "horizontal_error_p95_m": 0.0,
        "horizontal_error_p99_m": 0.0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    no_samples = HorizontalErrorDistribution(
        count=0,
        threshold_hit_share={100.0: 0.0},
        vertical_count=0,
        **zero_stats,
    )

    # Act
    verdict = verdict_passes_ac3(no_samples)

    # Assert
    assert verdict is False
def test_failure_message_references_calibration_method_factory_sheet() -> None:
    """The failure message names the AC, the gate, the calibration and the stats."""
    # Arrange
    distribution = _failing_distribution()
    context = _context(method="factory-sheet")

    # Act
    message = format_failure_message(distribution, context)

    # Assert: AZ-699 AC-3 — the message attributes the residual to the
    # calibration acquisition method and surfaces the budget.
    expected_fragments = (
        "AZ-699 AC-3",
        "45.0 %",  # threshold-hit share at 100 m, 0.45, as a percentage
        "100 m",
        f"{AC3_GATE_PCT:.0f} %",
        "Calibration: factory-sheet",
        "mean=180.0 m",
        "p95=420.0 m",
        "p99=580.0 m",
    )
    for fragment in expected_fragments:
        assert fragment in message
def test_failure_message_references_calibration_method_placeholder() -> None:
    """The failure message echoes a ``placeholder`` calibration method."""
    # Arrange
    distribution = _failing_distribution()
    context = _context(method="placeholder")

    # Act
    message = format_failure_message(distribution, context)

    # Assert
    assert "Calibration: placeholder" in message
# ---------------------------------------------------------------------
# Render report: AC-2 — every required row present
def test_render_report_contains_all_required_rows_on_pass() -> None:
    """A passing report carries its title, verdict, section headers and table rows."""
    # Arrange
    distribution = _passing_distribution()
    context = _context()

    # Act
    report = render_report(distribution, context, passed=True)

    # Assert: title line first, then the fixed headings and labels.
    assert report.startswith("# Real-flight validation — 2026-05-20")
    required_fragments = (
        "**Verdict**: PASS",
        "## Horizontal error (metres)",
        "## Threshold-hit share",
        "Calibration acquisition method: factory-sheet",
    )
    for fragment in required_fragments:
        assert fragment in report
    # One table row per horizontal statistic ...
    for stat_label in ("Mean", "p50", "p95", "p99"):
        assert f"| {stat_label} |" in report
    # ... and one per threshold.
    for limit in (10, 25, 50, 100):
        assert f"| {limit} |" in report
def test_render_report_marks_failure_when_below_gate() -> None:
    """A failing report shows a FAIL verdict and the 45.0 share on the 100 row."""
    # Arrange
    distribution = _failing_distribution()
    context = _context()

    # Act
    report = render_report(distribution, context, passed=False)

    # Assert
    assert "**Verdict**: FAIL" in report
    # The failing fixture's 0.45 share at 100 m must render as 45.0.
    assert "| 100 | 45.0 |" in report
def test_render_report_includes_vertical_when_available() -> None:
    """When vertical samples exist the report renders a vertical-error section."""
    # Arrange: uniform horizontal stats, plus ten vertical samples with
    # distinct mean / p50 / p95 values.
    every_threshold_hit = {10.0: 1.0, 25.0: 1.0, 50.0: 1.0, 100.0: 1.0}
    with_vertical = HorizontalErrorDistribution(
        count=10,
        horizontal_error_mean_m=10.0,
        horizontal_error_p50_m=10.0,
        horizontal_error_p95_m=10.0,
        horizontal_error_p99_m=10.0,
        threshold_hit_share=every_threshold_hit,
        vertical_count=10,
        vertical_error_mean_m=4.5,
        vertical_error_p50_m=4.0,
        vertical_error_p95_m=8.5,
    )

    # Act
    report = render_report(with_vertical, _context(), passed=True)

    # Assert: section header plus the rows carrying the vertical values.
    expected_fragments = (
        "## Vertical error (metres)",
        "| Mean | 4.50 |",
        "| p95 | 8.50 |",
        "| Samples | 10 |",
    )
    for fragment in expected_fragments:
        assert fragment in report
def test_render_report_marks_vertical_skipped_when_no_samples() -> None:
    """With zero vertical samples the report prints the no-altitude notice."""
    # Arrange: the passing fixture has vertical_count == 0.
    without_vertical = _passing_distribution()
    context = _context()

    # Act
    report = render_report(without_vertical, context, passed=True)

    # Assert
    assert "_No emissions carried a comparable altitude" in report
def test_render_report_labels_ground_truth_source_agnostic_for_csv_path() -> None:
    """A CSV ground-truth path is rendered under the "Ground truth:" label."""
    # Arrange — AZ-961 AC-2: the rendered label is "Ground truth: …"
    # for both tlog AND csv inputs, never "Tlog: …".
    csv_path = Path("/tmp/data_imu.csv")
    context = ReportContext(
        run_date_utc="2026-05-29",
        ground_truth_path=csv_path,
        video_path=Path("/tmp/nadir.mp4"),
        calibration_acquisition_method="factory-sheet",
        clip_duration_s=180.0,
        emissions_count=200,
    )
    distribution = _passing_distribution()

    # Act
    report = render_report(distribution, context, passed=True)

    # Assert: canonical label present, legacy label absent.
    assert "- Ground truth: `/tmp/data_imu.csv`" in report
    assert "- Tlog:" not in report
def test_render_report_labels_ground_truth_source_agnostic_for_tlog_path() -> None:
    """A tlog ground-truth path is rendered under the "Ground truth:" label."""
    # Arrange — AZ-961 AC-2: the same canonical label applies to tlog
    # runs; the default context points at a ``.tlog`` file.
    distribution = _passing_distribution()
    context = _context()

    # Act
    report = render_report(distribution, context, passed=True)

    # Assert: canonical label present, legacy label absent.
    assert "- Ground truth: `/tmp/derkachi.tlog`" in report
    assert "- Tlog:" not in report