[AZ-699] Real-flight validation runner + Markdown accuracy report

New e2e test runs gps-denied-replay --auto-trim against the real
derkachi.tlog + flight video + AZ-702 calibration, computes the
horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m
threshold-hit share), writes _docs/06_metrics/real_flight_
validation_{date}.md, and asserts honest PASS/FAIL with no @xfail
mask. AZ-404's 1-min test is untouched (sibling, not replacement).

Extends gps_compare.py with HorizontalErrorDistribution +
percentile_sorted (numpy-equivalent linear interpolation). New
test helper _report_writer.py renders the canonical Markdown
schema documented as FT-P-20 in blackbox-tests.md.

16 new unit tests pin distribution arithmetic, verdict gate,
failure-message templating (references calibration acquisition
method per AC-3), and report layout. 129 passed in focused
regression, 3 skipped (real video / Tier-2 prerequisites).
Zero new mypy --strict errors.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-20 16:53:48 +03:00
parent f5366bbca1
commit dcde602f61
9 changed files with 1261 additions and 2 deletions
+188
View File
@@ -0,0 +1,188 @@
"""AZ-699 Markdown accuracy-report writer (test helper).
Renders a :class:`HorizontalErrorDistribution` (the production
helper in ``gps_denied_onboard.helpers.gps_compare``) plus run
context (calibration acquisition method, clip duration, fixture
paths) into the canonical Markdown layout consumed by
``_docs/06_metrics/real_flight_validation_{date}.md``.
This module lives under ``tests/`` (NOT production) — the report
is an artefact of running the AZ-699 e2e test. Promoting the
writer to ``src/`` would invite production code to import a test
helper, so the file ownership rule keeps it here.
Style: every function is pure; the side effect (writing the file)
is the caller's. Tests in ``tests/unit/test_az699_report_writer.py``
exercise both the rendering and the threshold-gate verdict logic.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from gps_denied_onboard.helpers.gps_compare import HorizontalErrorDistribution
# Public names of this helper module (what ``from ... import *`` exposes).
__all__ = [
    "AC3_GATE_PCT",
    "AC3_GATE_THRESHOLD_M",
    "ReportContext",
    "format_failure_message",
    "render_report",
    "verdict_passes_ac3",
]
# AZ-696 epic AC-3 gate. These stay named module constants (rather
# than being inlined into the test) so the unit tests for the
# failure-message template can pin them.
# Distance threshold, in metres, whose hit share decides the verdict.
AC3_GATE_THRESHOLD_M: float = 100.0
# Minimum share of emissions, as a percentage (0-100), that must fall
# within the threshold above for the run to pass.
AC3_GATE_PCT: float = 80.0
@dataclass(frozen=True)
class ReportContext:
    """Immutable run metadata printed in the report header.

    Attributes:
        run_date_utc: Date of the run as an ISO-8601 string
            (YYYY-MM-DD); it is also embedded in the report filename.
        tlog_path: Path of the real tlog the runner consumed.
        video_path: Path of the video clip the runner consumed.
        calibration_acquisition_method: Where the camera calibration
            came from (e.g. ``"factory-sheet"`` for AZ-702 or
            ``"placeholder"`` for the adti26 fallback). Quoted in the
            failure message per AZ-699 AC-3.
        clip_duration_s: Length of the analysed clip, in seconds.
        emissions_count: Number of estimator-output records read from
            the JSONL. It can differ from ``distribution.count`` when
            some emissions fall outside the ground-truth window.
    """

    run_date_utc: str
    tlog_path: Path
    video_path: Path
    calibration_acquisition_method: str
    clip_duration_s: float
    emissions_count: int
def verdict_passes_ac3(distribution: HorizontalErrorDistribution) -> bool:
    """Decide whether a run satisfies the AZ-696 epic AC-3 gate.

    The run passes only when it has at least one ground-truth pairing,
    a hit share is recorded for ``AC3_GATE_THRESHOLD_M``, and that
    share (a fraction) expressed as a percentage reaches
    ``AC3_GATE_PCT``.
    """
    if distribution.count != 0:
        gate_share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M)
        if gate_share is not None:
            return gate_share * 100.0 >= AC3_GATE_PCT
    # No pairings, or no entry for the gate threshold: never a pass.
    return False
def format_failure_message(
    distribution: HorizontalErrorDistribution,
    context: ReportContext,
) -> str:
    """Compose the AZ-699 AC-3 failure message for a failing run.

    The text states the measured hit share at the gate threshold, the
    required share, the residual statistics (mean / p50 / p95 / p99),
    the calibration acquisition method, and the report file to open,
    so a failure can be attributed without re-reading the source.
    A missing gate-threshold entry is reported as a 0 % hit share.
    """
    gate_share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M, 0.0)
    hit_pct = gate_share * 100.0

    headline = (
        f"AZ-699 AC-3: only {hit_pct:.1f} % of {distribution.count} "
        f"emissions within {AC3_GATE_THRESHOLD_M:.0f} m of ground "
        f"truth; epic threshold is {AC3_GATE_PCT:.0f} %. "
    )
    residual = (
        f"Residual: mean={distribution.horizontal_error_mean_m:.1f} m, "
        f"p50={distribution.horizontal_error_p50_m:.1f} m, "
        f"p95={distribution.horizontal_error_p95_m:.1f} m, "
        f"p99={distribution.horizontal_error_p99_m:.1f} m. "
    )
    calibration = f"Calibration: {context.calibration_acquisition_method}. "
    pointer = (
        "See _docs/06_metrics/real_flight_validation_"
        f"{context.run_date_utc}.md for the full distribution."
    )
    return headline + residual + calibration + pointer
def render_report(
    distribution: HorizontalErrorDistribution,
    context: ReportContext,
    *,
    passed: bool,
) -> str:
    """Produce the Markdown report body as a single string.

    Sections, in order: title, verdict line, run context, horizontal
    error statistics, threshold-hit share table (ascending threshold),
    and either the vertical error statistics or a note that they were
    skipped. This is the layout referenced by
    ``_docs/02_document/tests/blackbox-tests.md``. The returned text
    ends with a single trailing newline.
    """
    stat_table_head = ["| Statistic | Value |", "| --------- | ----- |"]

    verdict = "FAIL"
    if passed:
        verdict = "PASS"
    verdict_line = (
        f"**Verdict**: {verdict} (AC-3 gate: "
        f"{AC3_GATE_PCT:.0f} % within "
        f"{AC3_GATE_THRESHOLD_M:.0f} m)"
    )
    calibration_line = (
        f"- Calibration acquisition method: {context.calibration_acquisition_method}"
    )

    # Title, verdict and run context.
    out: list[str] = [
        f"# Real-flight validation — {context.run_date_utc}",
        "",
        verdict_line,
        "",
        "## Run context",
        "",
        f"- Tlog: `{context.tlog_path}`",
        f"- Video: `{context.video_path}`",
        calibration_line,
        f"- Clip duration: {context.clip_duration_s:.1f} s",
        f"- Emissions consumed: {context.emissions_count}",
        f"- Ground-truth pairings: {distribution.count}",
        "",
    ]

    # Horizontal error statistics.
    horizontal_stats = {
        "Mean": distribution.horizontal_error_mean_m,
        "p50": distribution.horizontal_error_p50_m,
        "p95": distribution.horizontal_error_p95_m,
        "p99": distribution.horizontal_error_p99_m,
    }
    out += ["## Horizontal error (metres)", "", *stat_table_head]
    out += [
        f"| {label} | {metres:.2f} |"
        for label, metres in horizontal_stats.items()
    ]
    out.append("")

    # Threshold-hit share, one row per threshold in ascending order.
    out += [
        "## Threshold-hit share",
        "",
        "| Threshold (m) | Hit share (%) |",
        "| ------------- | ------------- |",
    ]
    out += [
        f"| {threshold_m:g} | {fraction * 100.0:.1f} |"
        for threshold_m, fraction in sorted(
            distribution.threshold_hit_share.items()
        )
    ]
    out.append("")

    # Vertical error statistics, or a note when there are no samples.
    if distribution.vertical_count > 0:
        out += ["## Vertical error (metres)", "", *stat_table_head]
        out += [
            f"| Mean | {distribution.vertical_error_mean_m:.2f} |",
            f"| p50 | {distribution.vertical_error_p50_m:.2f} |",
            f"| p95 | {distribution.vertical_error_p95_m:.2f} |",
            f"| Samples | {distribution.vertical_count} |",
            "",
        ]
    else:
        out += [
            "## Vertical error",
            "",
            "_No emissions carried a comparable altitude — vertical stats skipped._",
            "",
        ]

    return "\n".join(out) + "\n"
+321
View File
@@ -0,0 +1,321 @@
"""AZ-699 — Real-flight validation runner + accuracy report.
Runs the ``gps-denied-replay`` console-script against the **real**
``derkachi.tlog`` (binary, not the AZ-265 synth) plus the real
flight video and the AZ-702 factory-sheet camera calibration,
computes the full horizontal-error distribution, and writes a
structured Markdown accuracy report to
``_docs/06_metrics/real_flight_validation_{YYYY-MM-DD}.md``.
Unlike the AZ-404 1-min test (``test_derkachi_1min.py``), this one
emits an **honest** PASS/FAIL verdict — no ``@xfail`` mask. The
threshold gate is ≥ 80 % of emissions within 100 m (AZ-696 epic
AC-3). When the run fails, the failure message references the
calibration acquisition method (factory-sheet for AZ-702 or the
``adti26.json`` placeholder) so the root cause is attributable
without re-reading the source.
The test SKIPS (not ``@xfail``) when any prerequisite is absent:
* ``RUN_REPLAY_E2E`` env var not set (matches AZ-404 gating).
* Real ``derkachi.tlog`` binary missing.
* Real ``flight_derkachi.mp4`` missing OR placeholder-sized
(the repo currently ships a 134-byte placeholder for the
AZ-404 fixture; this AZ-699 path requires a decodable video).
* ``gps-denied-replay`` console-script not installed.
Style: every test follows the Arrange / Act / Assert pattern.
"""
from __future__ import annotations
import datetime
import json
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
from typing import Any
import pytest
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
horizontal_error_distribution,
)
from gps_denied_onboard.replay_input import load_tlog_ground_truth
from tests.e2e.replay._report_writer import (
AC3_GATE_PCT,
AC3_GATE_THRESHOLD_M,
ReportContext,
format_failure_message,
render_report,
verdict_passes_ac3,
)
# ----------------------------------------------------------------------
# Skip gates
def _repo_root() -> Path:
    """Return the directory three levels above this file's directory."""
    this_file = Path(__file__).resolve()
    # parents[0] is the containing directory; parents[3] is three above it.
    return this_file.parents[3]
def _derkachi_dir() -> Path:
    """Return the Derkachi flight fixture directory under the repo root."""
    return _repo_root().joinpath(
        "_docs", "00_problem", "input_data", "flight_derkachi"
    )
# Size floor used to tell a real recording from the checked-in
# placeholder. The current placeholder is 134 bytes; any video file
# smaller than this 1 MB floor is treated as "not a real recording"
# and the AZ-699 test is skipped, while a real recording is expected
# to be larger and therefore accepted.
_MIN_REAL_VIDEO_BYTES: int = 1_000_000
def _real_inputs_present() -> tuple[bool, str]:
    """Check that the real tlog, video and calibration fixtures exist.

    Returns:
        ``(True, "")`` when every input is usable, otherwise
        ``(False, reason)`` where ``reason`` names the first problem
        found (missing tlog, missing video, placeholder-sized video,
        missing calibration — checked in that order).
    """
    fixture_dir = _derkachi_dir()
    tlog = fixture_dir / "derkachi.tlog"
    video = fixture_dir / "flight_derkachi.mp4"
    calibration = fixture_dir / "khp20s30_factory.json"

    if not tlog.is_file():
        return False, f"real tlog missing: {tlog}"

    if not video.is_file():
        return False, f"real video missing: {video}"

    video_size = video.stat().st_size
    if video_size < _MIN_REAL_VIDEO_BYTES:
        too_small = (
            f"video at {video} is only {video_size} bytes — "
            "placeholder, not a real recording. Drop the real "
            "flight_derkachi.mp4 into the fixture dir to enable AZ-699."
        )
        return False, too_small

    if not calibration.is_file():
        no_calibration = (
            f"calibration missing: {calibration}; ship the AZ-702 "
            "factory-sheet calibration to enable AZ-699."
        )
        return False, no_calibration

    return True, ""
def _heavy_skip_reason() -> str | None:
    """Return a skip reason unless ``RUN_REPLAY_E2E`` opts in.

    The environment variable is compared case-insensitively against
    ``1`` / ``true`` / ``yes`` / ``on``; any other value (or an unset
    variable) yields the skip message.
    """
    flag = os.environ.get("RUN_REPLAY_E2E", "").lower()
    if flag in {"1", "true", "yes", "on"}:
        return None
    return "AZ-699 e2e test gated by RUN_REPLAY_E2E=1"
def _binary_present() -> tuple[bool, str | None]:
    """Locate the ``gps-denied-replay`` console-script.

    Looks on ``PATH`` first, then next to the running interpreter.

    Returns:
        ``(True, path)`` when found, ``(False, None)`` otherwise.
    """
    on_path = shutil.which("gps-denied-replay")
    if on_path is None:
        # Fall back to the directory holding the current interpreter.
        beside_python = Path(sys.executable).parent / "gps-denied-replay"
        if not beside_python.exists():
            return False, None
        return True, str(beside_python)
    return True, on_path
# ----------------------------------------------------------------------
# Helpers
def _read_calibration_acquisition_method(calibration_path: Path) -> str:
    """Extract the ``acquisition_method`` field set by AZ-702.

    AZ-702 documents that the camera calibration JSON carries an
    ``acquisition_method`` string ("factory-sheet" for the KHP20S30
    factory deliverable). Whenever that value cannot be read we label
    the run ``"unknown"`` so the failure message stays honest —
    pretending the calibration is factory-sheet when it is actually a
    placeholder would mis-attribute the residual.

    Args:
        calibration_path: Path of the calibration JSON file.

    Returns:
        The non-empty ``acquisition_method`` string, or ``"unknown"``
        when the file is unreadable, is not valid JSON, is not a JSON
        object, or has no non-empty string under that key.
    """
    try:
        # Parse from bytes so the result does not depend on the locale
        # encoding; ``json.loads`` detects UTF-8/16/32 itself.
        data = json.loads(calibration_path.read_bytes())
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return "unknown"
    if not isinstance(data, dict):
        # A top-level list/string/number has no ``.get``.
        return "unknown"
    method = data.get("acquisition_method")
    if isinstance(method, str) and method:
        return method
    return "unknown"
def _parse_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse a JSON Lines file into a list of decoded records.

    The file is read as UTF-8 and split on real line endings only.
    ``str.splitlines()`` would also split on characters such as
    U+2028, which may appear unescaped inside a JSON string and would
    cut a valid record in half. Blank lines are skipped.

    Args:
        path: Path of the JSONL file.

    Returns:
        One decoded value per non-blank line, in file order.

    Raises:
        OSError: The file cannot be opened or read.
        json.JSONDecodeError: A non-blank line is not valid JSON; the
            message is prefixed with ``<path>:<line number>``.
    """
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf-8") as stream:
        for line_no, line in enumerate(stream, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but say which record is broken
                # (e.g. a truncated last line from a killed run).
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
    return rows
def _load_full_ground_truth(tlog_path: Path) -> list[GroundTruthRow]:
    """Load every ground-truth record of the tlog as ``GroundTruthRow``s.

    Each record's ``ts_ns`` is divided by 1e9 to fill ``t_s``; latitude,
    longitude and altitude are copied through unchanged.
    """
    rows: list[GroundTruthRow] = []
    for fix in load_tlog_ground_truth(tlog_path).records:
        row = GroundTruthRow(
            t_s=fix.ts_ns / 1e9,
            lat_deg=fix.lat_deg,
            lon_deg=fix.lon_deg,
            alt_m=fix.alt_m,
        )
        rows.append(row)
    return rows
# ----------------------------------------------------------------------
# AC-1 + AC-2 + AC-3: full real-flight run + honest verdict + report
@pytest.mark.tier2
def test_az699_real_flight_validation_emits_verdict_and_report(
    tmp_path: Path,
) -> None:
    """Run the real-flight replay, write the report, assert the verdict.

    Skips (never xfails) when a prerequisite is missing. Otherwise it
    runs ``gps-denied-replay --auto-trim`` on the real fixtures within
    the 15-minute budget, writes the Markdown report under
    ``_docs/06_metrics``, checks the report layout (AC-2) and fails
    with the calibration-aware message when the AC-3 gate is missed.
    """
    # Arrange: gate cleanly on missing prerequisites (no xfail).
    heavy_reason = _heavy_skip_reason()
    if heavy_reason is not None:
        pytest.skip(heavy_reason)
    inputs_ok, inputs_reason = _real_inputs_present()
    if not inputs_ok:
        pytest.skip(inputs_reason)
    binary_ok, binary = _binary_present()
    if not binary_ok or binary is None:
        pytest.skip("gps-denied-replay console-script not installed")

    derkachi = _derkachi_dir()
    tlog_path = derkachi / "derkachi.tlog"
    video_path = derkachi / "flight_derkachi.mp4"
    calibration_path = derkachi / "khp20s30_factory.json"
    output_path = tmp_path / "estimator_output.jsonl"

    # Signing-key stub (NoopMavlinkTransport ignores bytes; path
    # validation just needs the file to exist).
    signing_key_path = tmp_path / "signing_key.bin"
    signing_key_path.write_bytes(b"\x00" * 32)

    config_path = tmp_path / "config.yaml"
    config_path.write_text(
        "mode: replay\n"
        "replay:\n"
        "  pace: asap\n"
        "  target_fc_dialect: ardupilot_plane\n",
        encoding="utf-8",
    )

    argv = [
        binary,
        "--video",
        str(video_path),
        "--tlog",
        str(tlog_path),
        "--output",
        str(output_path),
        "--camera-calibration",
        str(calibration_path),
        "--config",
        str(config_path),
        "--mavlink-signing-key",
        str(signing_key_path),
        "--pace",
        "asap",
        # AZ-698: let the segmenter + NCC find the last flight
        # window in the multi-flight tlog. No manual offset.
        "--auto-trim",
    ]

    # 15-min NFR budget per AZ-699 spec; shared by the subprocess
    # timeout and the wall-clock assertion below.
    budget_s = 900.0

    # Act
    t0 = time.monotonic()
    try:
        completed = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=budget_s,
        )
    except subprocess.TimeoutExpired as exc:
        # The timeout fires before the wall-clock assert could ever
        # report the overrun, so turn it into the NFR failure here and
        # keep whatever output the child produced.
        pytest.fail(
            f"AZ-699 NFR: real-flight run exceeded the {budget_s:.0f} s "
            "(15 min) budget and was killed\n"
            f"stdout:\n{exc.stdout!r}\nstderr:\n{exc.stderr!r}"
        )
    wall_clock_s = time.monotonic() - t0

    assert completed.returncode == 0, (
        f"gps-denied-replay exited {completed.returncode}\n"
        f"stdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
    )
    assert wall_clock_s <= budget_s, (
        f"AZ-699 NFR: real-flight run took {wall_clock_s:.1f} s; "
        "budget is 15 min"
    )

    emissions = _parse_jsonl(output_path)
    assert emissions, "no JSONL emissions produced — pipeline failure"

    ground_truth = _load_full_ground_truth(tlog_path)
    assert ground_truth, "ground-truth extraction yielded zero rows"

    distribution = horizontal_error_distribution(emissions, ground_truth)
    assert distribution.count > 0, (
        "no emissions paired with ground truth — JSONL timestamps "
        "outside the tlog GPS window?"
    )

    # ground_truth is non-empty here (asserted above), so the span can
    # be taken directly.
    # NOTE(review): this is the time span of ALL ground-truth rows in
    # the tlog; with --auto-trim on a multi-flight tlog it may be
    # longer than the clip actually analysed — confirm intent.
    clip_duration_s = ground_truth[-1].t_s - ground_truth[0].t_s

    context = ReportContext(
        run_date_utc=datetime.datetime.now(datetime.timezone.utc)
        .date()
        .isoformat(),
        tlog_path=tlog_path,
        video_path=video_path,
        calibration_acquisition_method=_read_calibration_acquisition_method(
            calibration_path
        ),
        clip_duration_s=clip_duration_s,
        emissions_count=len(emissions),
    )

    passed = verdict_passes_ac3(distribution)
    report_text = render_report(distribution, context, passed=passed)

    report_dir = _repo_root() / "_docs" / "06_metrics"
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = (
        report_dir / f"real_flight_validation_{context.run_date_utc}.md"
    )
    # The report contains non-ASCII text; pin UTF-8 so the file does
    # not depend on the machine's locale encoding.
    report_path.write_text(report_text, encoding="utf-8")

    # Assert AC-2: the report exists and contains all required rows.
    assert report_path.is_file()
    written = report_path.read_text(encoding="utf-8")
    assert "## Horizontal error (metres)" in written
    assert "## Threshold-hit share" in written
    assert "Mean" in written
    assert "p50" in written
    assert "p95" in written
    assert "p99" in written
    for threshold in (10, 25, 50, 100):
        assert f"| {threshold} |" in written, (
            f"threshold {threshold} m row missing from report"
        )

    # Assert AC-1 + AC-3: honest verdict. On FAIL, the message must
    # reference the calibration method (AZ-699 AC-3).
    if not passed:
        msg = format_failure_message(distribution, context)
        pytest.fail(
            f"{msg}\n\nReport: {report_path}\n"
            f"Run wall-clock: {wall_clock_s:.1f} s"
        )

    # PASS path — emit a concise summary so the CI log carries the
    # numbers without forcing the operator to open the report.
    share_100m = distribution.threshold_hit_share[AC3_GATE_THRESHOLD_M] * 100.0
    assert share_100m >= AC3_GATE_PCT  # guarded by verdict_passes_ac3
    print(
        f"AZ-699 PASS — {share_100m:.1f} % within "
        f"{AC3_GATE_THRESHOLD_M:.0f} m, mean="
        f"{distribution.horizontal_error_mean_m:.1f} m, p95="
        f"{distribution.horizontal_error_p95_m:.1f} m. "
        f"Report at {report_path}",
        file=sys.stderr,
    )
+342
View File
@@ -0,0 +1,342 @@
"""AZ-699 — Report writer + accuracy-distribution unit tests.
Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message
templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi
inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py``
and is skipped without the real video + ``RUN_REPLAY_E2E=1``.
Style: every test follows the Arrange / Act / Assert pattern.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
HorizontalErrorDistribution,
horizontal_error_distribution,
percentile_sorted,
)
from tests.e2e.replay._report_writer import (
AC3_GATE_PCT,
AC3_GATE_THRESHOLD_M,
ReportContext,
format_failure_message,
render_report,
verdict_passes_ac3,
)
# ---------------------------------------------------------------------
# percentile_sorted: pin numpy-equivalent linear interpolation
def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None:
    """p0, p100 and p50 of a five-element list hit exact elements."""
    # Arrange
    samples = [1.0, 2.0, 3.0, 4.0, 5.0]
    expected_by_pct = {0.0: 1.0, 100.0: 5.0, 50.0: 3.0}

    # Act / Assert
    for pct, expected in expected_by_pct.items():
        assert percentile_sorted(samples, pct) == expected
def test_percentile_sorted_p95_linear_interpolation() -> None:
    """p95 of the integers 0..100 is 95 under linear interpolation."""
    # Arrange
    ramp = list(range(101))  # 0..100

    # Act
    result = percentile_sorted(ramp, 95.0)

    # Assert: with 101 evenly spaced values, numpy-style linear
    # interpolation lands exactly on index 95, whose value is 95.
    assert result == pytest.approx(95.0, abs=1e-9)
def test_percentile_sorted_empty_returns_zero() -> None:
    """An empty input yields 0.0."""
    # Act
    result = percentile_sorted([], 50.0)

    # Assert
    assert result == 0.0
# ---------------------------------------------------------------------
# horizontal_error_distribution: walk emissions once, return aggregate
def test_distribution_zero_error_at_collocated_points() -> None:
    """Emissions identical to ground truth give zero error everywhere."""
    # Arrange: estimator emissions perfectly match ground-truth rows.
    fix = {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0}
    truth = [GroundTruthRow(t_s=t_s, **fix) for t_s in (0.0, 1.0)]
    estimates = [
        {"emitted_at": emitted_at, "position_wgs84": dict(fix)}
        for emitted_at in (0, 1_000_000_000)
    ]

    # Act
    dist = horizontal_error_distribution(estimates, truth)

    # Assert
    assert dist.count == 2
    assert dist.horizontal_error_mean_m == pytest.approx(0.0, abs=1e-6)
    assert dist.horizontal_error_p99_m == pytest.approx(0.0, abs=1e-6)
    for threshold_m in (10.0, 25.0, 50.0, 100.0):
        hit_share = dist.threshold_hit_share[threshold_m]
        assert hit_share == pytest.approx(1.0)
def test_distribution_threshold_hit_share_partials() -> None:
    """Each threshold counts exactly the emissions closer than it."""
    # Arrange: four emissions at increasing northward offsets from a
    # single ground-truth point. One degree of latitude is roughly
    # 111 km, so ``offset_m / 111_000`` degrees is about ``offset_m``
    # metres. The offsets (5 m, 20 m, 40 m, 80 m) put exactly one
    # emission below each of the 10 / 25 / 50 / 100 m thresholds, so
    # the threshold counters are unambiguous.
    truth = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    estimates = []
    for index, offset_m in enumerate((5.0, 20.0, 40.0, 80.0)):
        position = {
            "lat_deg": 50.0 + (offset_m / 111_000.0),
            "lon_deg": 30.0,
            "alt_m": 100.0,
        }
        estimates.append({"emitted_at": index, "position_wgs84": position})

    # Act
    dist = horizontal_error_distribution(estimates, truth)

    # Assert
    assert dist.count == 4
    expected_share = {
        10.0: 0.25,  # only 5 m
        25.0: 0.50,  # + 20 m
        50.0: 0.75,  # + 40 m
        100.0: 1.0,  # + 80 m
    }
    for threshold_m, expected in expected_share.items():
        assert dist.threshold_hit_share[threshold_m] == pytest.approx(expected)
def test_distribution_vertical_skipped_when_alt_absent() -> None:
    """An emission without ``alt_m`` adds no vertical sample."""
    # Arrange: the emission has no altitude, so only horizontal stats
    # can be computed for it.
    truth = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]
    position_without_alt = {"lat_deg": 50.0, "lon_deg": 30.0}
    estimates = [{"emitted_at": 0, "position_wgs84": position_without_alt}]

    # Act
    dist = horizontal_error_distribution(estimates, truth)

    # Assert
    assert dist.vertical_count == 0
    assert dist.vertical_error_mean_m == 0.0
    assert dist.count == 1
def test_distribution_empty_emissions_returns_zero_dist() -> None:
    """No emissions produce an all-zero distribution."""
    # Arrange
    truth = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)]

    # Act
    dist = horizontal_error_distribution([], truth)

    # Assert
    assert dist.count == 0
    assert dist.horizontal_error_mean_m == 0.0
    assert all(share == 0.0 for share in dist.threshold_hit_share.values())
def test_distribution_raises_on_empty_ground_truth() -> None:
    """An empty ground-truth list is rejected with ``AssertionError``."""
    # Arrange
    lone_emission = {
        "emitted_at": 0,
        "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0},
    }
    no_truth: list[GroundTruthRow] = []

    # Act / Assert
    with pytest.raises(AssertionError):
        horizontal_error_distribution([lone_emission], no_truth)
# ---------------------------------------------------------------------
# Verdict + failure-message: AC-3 honesty
def _passing_distribution() -> HorizontalErrorDistribution:
    """Fixture: 92 % of 100 pairings within 100 m — above the gate."""
    horizontal = {
        "count": 100,
        "horizontal_error_mean_m": 20.0,
        "horizontal_error_p50_m": 18.0,
        "horizontal_error_p95_m": 70.0,
        "horizontal_error_p99_m": 95.0,
        "threshold_hit_share": {10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92},
    }
    # No vertical samples in this fixture.
    vertical = {
        "vertical_count": 0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    return HorizontalErrorDistribution(**horizontal, **vertical)
def _failing_distribution() -> HorizontalErrorDistribution:
    """Fixture: 45 % of 200 pairings within 100 m — below the gate."""
    horizontal = {
        "count": 200,
        "horizontal_error_mean_m": 180.0,
        "horizontal_error_p50_m": 140.0,
        "horizontal_error_p95_m": 420.0,
        "horizontal_error_p99_m": 580.0,
        "threshold_hit_share": {10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45},
    }
    # No vertical samples in this fixture.
    vertical = {
        "vertical_count": 0,
        "vertical_error_mean_m": 0.0,
        "vertical_error_p50_m": 0.0,
        "vertical_error_p95_m": 0.0,
    }
    return HorizontalErrorDistribution(**horizontal, **vertical)
def _context(method: str = "factory-sheet") -> ReportContext:
    """Fixture: a fixed run context with a selectable calibration method."""
    fixture_dir = Path("/tmp")
    return ReportContext(
        run_date_utc="2026-05-20",
        tlog_path=fixture_dir / "derkachi.tlog",
        video_path=fixture_dir / "flight_derkachi.mp4",
        calibration_acquisition_method=method,
        clip_duration_s=180.0,
        emissions_count=200,
    )
def test_verdict_passes_when_at_least_80pct_within_100m() -> None:
    """The gate accepts the 92 % fixture and rejects the 45 % one."""
    # Act
    passing_verdict = verdict_passes_ac3(_passing_distribution())
    failing_verdict = verdict_passes_ac3(_failing_distribution())

    # Assert
    assert passing_verdict is True
    assert failing_verdict is False
def test_verdict_fails_on_zero_count() -> None:
    """A distribution with no pairings can never pass the gate."""
    # Arrange
    zeroed_stats = dict.fromkeys(
        (
            "horizontal_error_mean_m",
            "horizontal_error_p50_m",
            "horizontal_error_p95_m",
            "horizontal_error_p99_m",
            "vertical_error_mean_m",
            "vertical_error_p50_m",
            "vertical_error_p95_m",
        ),
        0.0,
    )
    empty = HorizontalErrorDistribution(
        count=0,
        threshold_hit_share={100.0: 0.0},
        vertical_count=0,
        **zeroed_stats,
    )

    # Act
    verdict = verdict_passes_ac3(empty)

    # Assert
    assert verdict is False
def test_failure_message_references_calibration_method_factory_sheet() -> None:
    """The failure text names the calibration method and the budget."""
    # Arrange
    failing = _failing_distribution()
    factory_context = _context(method="factory-sheet")

    # Act
    msg = format_failure_message(failing, factory_context)

    # Assert: AZ-699 AC-3 — the message attributes the residual to the
    # calibration acquisition method and surfaces the budget.
    expected_fragments = (
        "AZ-699 AC-3",
        "45.0 %",  # threshold-hit share at 100 m × 100 = 45.0
        "100 m",
        f"{AC3_GATE_PCT:.0f} %",
        "Calibration: factory-sheet",
        "mean=180.0 m",
        "p95=420.0 m",
        "p99=580.0 m",
    )
    for fragment in expected_fragments:
        assert fragment in msg
def test_failure_message_references_calibration_method_placeholder() -> None:
    """A placeholder calibration is named as such in the failure text."""
    # Arrange
    placeholder_context = _context(method="placeholder")

    # Act
    msg = format_failure_message(_failing_distribution(), placeholder_context)

    # Assert
    assert "Calibration: placeholder" in msg
# ---------------------------------------------------------------------
# Render report: AC-2 — every required row present
def test_render_report_contains_all_required_rows_on_pass() -> None:
    """A passing report carries every heading, statistic and threshold."""
    # Act
    text = render_report(_passing_distribution(), _context(), passed=True)

    # Assert
    assert text.startswith("# Real-flight validation — 2026-05-20")
    required_snippets = (
        "**Verdict**: PASS",
        "## Horizontal error (metres)",
        "## Threshold-hit share",
        "Calibration acquisition method: factory-sheet",
    )
    for snippet in required_snippets:
        assert snippet in text
    # Every statistic and threshold appears as the first table cell.
    for first_cell in ("Mean", "p50", "p95", "p99", 10, 25, 50, 100):
        assert f"| {first_cell} |" in text
def test_render_report_marks_failure_when_below_gate() -> None:
    """A failing report says FAIL and shows the 100 m share as 45.0 %."""
    # Arrange
    failing = _failing_distribution()

    # Act
    text = render_report(failing, _context(), passed=False)

    # Assert
    assert "**Verdict**: FAIL" in text
    # The failing fixture has a 0.45 share at 100 m, rendered as 45.0.
    assert "| 100 | 45.0 |" in text
def test_render_report_includes_vertical_when_available() -> None:
    """Vertical statistics are rendered when vertical samples exist."""
    # Arrange
    all_within = dict.fromkeys((10.0, 25.0, 50.0, 100.0), 1.0)
    dist = HorizontalErrorDistribution(
        count=10,
        horizontal_error_mean_m=10.0,
        horizontal_error_p50_m=10.0,
        horizontal_error_p95_m=10.0,
        horizontal_error_p99_m=10.0,
        threshold_hit_share=all_within,
        vertical_count=10,
        vertical_error_mean_m=4.5,
        vertical_error_p50_m=4.0,
        vertical_error_p95_m=8.5,
    )

    # Act
    text = render_report(dist, _context(), passed=True)

    # Assert
    expected_rows = (
        "## Vertical error (metres)",
        "| Mean | 4.50 |",
        "| p95 | 8.50 |",
        "| Samples | 10 |",
    )
    for row in expected_rows:
        assert row in text
def test_render_report_marks_vertical_skipped_when_no_samples() -> None:
    """Without vertical samples the report shows the skipped note."""
    # Arrange: the passing fixture has vertical_count == 0.
    no_vertical = _passing_distribution()

    # Act
    text = render_report(no_vertical, _context(), passed=True)

    # Assert
    assert "_No emissions carried a comparable altitude" in text