mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-22 12:31:13 +00:00
[AZ-699] Real-flight validation runner + Markdown accuracy report
New e2e test runs gps-denied-replay --auto-trim against the real
derkachi.tlog + flight video + AZ-702 calibration, computes the
horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m
threshold-hit share), writes
_docs/06_metrics/real_flight_validation_{date}.md, and asserts honest
PASS/FAIL with no @xfail
mask. AZ-404's 1-min test is untouched (sibling, not replacement).
Extends gps_compare.py with HorizontalErrorDistribution +
percentile_sorted (numpy-equivalent linear interpolation). New
test helper _report_writer.py renders the canonical Markdown
schema documented as FT-P-20 in blackbox-tests.md.
16 new unit tests pin distribution arithmetic, verdict gate,
failure-message templating (references calibration acquisition
method per AC-3), and report layout. 129 passed in focused
regression, 3 skipped (real video / Tier-2 prerequisites).
Zero new mypy --strict errors.
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -0,0 +1,188 @@
|
||||
"""AZ-699 Markdown accuracy-report writer (test helper).
|
||||
|
||||
Renders a :class:`HorizontalErrorDistribution` (the production
|
||||
helper in ``gps_denied_onboard.helpers.gps_compare``) plus run
|
||||
context (calibration acquisition method, clip duration, fixture
|
||||
paths) into the canonical Markdown layout consumed by
|
||||
``_docs/06_metrics/real_flight_validation_{date}.md``.
|
||||
|
||||
This module lives under ``tests/`` (NOT production) — the report
|
||||
is an artefact of running the AZ-699 e2e test. Promoting the
|
||||
writer to ``src/`` would invite production code to import a test
|
||||
helper, so the file ownership rule keeps it here.
|
||||
|
||||
Style: every function is pure; the side effect (writing the file)
|
||||
is the caller's. Tests in ``tests/unit/test_az699_report_writer.py``
|
||||
exercise both the rendering and the threshold-gate verdict logic.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from gps_denied_onboard.helpers.gps_compare import HorizontalErrorDistribution
|
||||
|
||||
__all__ = [
|
||||
"AC3_GATE_PCT",
|
||||
"AC3_GATE_THRESHOLD_M",
|
||||
"ReportContext",
|
||||
"format_failure_message",
|
||||
"render_report",
|
||||
"verdict_passes_ac3",
|
||||
]
|
||||
|
||||
|
||||
# AZ-696 epic AC-3 gate: the distance threshold (metres) and the minimum
# share of emissions (percent) that must fall within it. They are kept as
# named constants here (rather than inlined into the test) so the unit
# tests for the failure-message template can pin them.
AC3_GATE_THRESHOLD_M: float = 100.0
AC3_GATE_PCT: float = 80.0
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ReportContext:
    """Run context surfaced in the report header.

    Instances are immutable (``frozen=True``).

    Attributes:
        run_date_utc: ISO-8601 date (YYYY-MM-DD) at which the run
            executed — drives the report filename.
        tlog_path: Real tlog the runner consumed.
        video_path: Video clip the runner consumed.
        calibration_acquisition_method: Provenance of the camera
            calibration (e.g. ``"factory-sheet"`` for AZ-702 or
            ``"placeholder"`` for the adti26 fallback). Surfaced in
            the failure message per AZ-699 AC-3.
        clip_duration_s: Duration of the analysed clip in seconds.
        emissions_count: Total estimator-output records consumed
            from the JSONL (may differ from
            ``distribution.count`` when some emissions land
            outside the GT window).
    """

    run_date_utc: str
    tlog_path: Path
    video_path: Path
    calibration_acquisition_method: str
    clip_duration_s: float
    emissions_count: int
|
||||
|
||||
|
||||
def verdict_passes_ac3(distribution: HorizontalErrorDistribution) -> bool:
    """Return ``True`` when the run meets AZ-696 epic AC-3.

    The gate passes when the hit share recorded for
    ``AC3_GATE_THRESHOLD_M``, scaled to percent, is at least
    ``AC3_GATE_PCT``.

    Args:
        distribution: Error distribution to judge. Only ``count`` and
            ``threshold_hit_share`` are read.

    Returns:
        ``False`` when the distribution holds no samples or carries no
        entry for the gate threshold; otherwise the result of the
        percentage comparison.
    """
    # An empty distribution can never be evidence of a pass.
    if distribution.count == 0:
        return False
    share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M)
    # A missing threshold bucket is treated as a failure, not as 0 %.
    if share is None:
        return False
    # The share is multiplied by 100 before comparing against the
    # percent-valued gate (presumably it is stored as a 0..1 fraction).
    return share * 100.0 >= AC3_GATE_PCT
|
||||
|
||||
|
||||
def format_failure_message(
    distribution: HorizontalErrorDistribution,
    context: ReportContext,
) -> str:
    """Build the honest failure message for AZ-699 AC-3.

    The message references the calibration acquisition method
    (factory-sheet for AZ-702 or placeholder otherwise) and the
    measured residual budget, so the operator can attribute a
    failure to its likely root cause (calibration uncertainty,
    drift, anchor scarcity) without re-reading the source.

    Args:
        distribution: Measured error distribution; supplies the hit
            share at the gate threshold, the sample count and the
            mean / p50 / p95 / p99 horizontal errors.
        context: Run context; supplies the calibration acquisition
            method and the run date used in the report filename.

    Returns:
        A single-line message. A missing gate-threshold entry in
        ``threshold_hit_share`` is reported as 0.0 %.
    """
    share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M, 0.0)
    pct = share * 100.0
    return (
        f"AZ-699 AC-3: only {pct:.1f} % of {distribution.count} "
        f"emissions within {AC3_GATE_THRESHOLD_M:.0f} m of ground "
        f"truth; epic threshold is {AC3_GATE_PCT:.0f} %. "
        f"Residual: mean={distribution.horizontal_error_mean_m:.1f} m, "
        f"p50={distribution.horizontal_error_p50_m:.1f} m, "
        f"p95={distribution.horizontal_error_p95_m:.1f} m, "
        f"p99={distribution.horizontal_error_p99_m:.1f} m. "
        f"Calibration: {context.calibration_acquisition_method}. "
        "See _docs/06_metrics/real_flight_validation_"
        f"{context.run_date_utc}.md for the full distribution."
    )
|
||||
|
||||
|
||||
def _markdown_table(
    headers: tuple[str, str],
    rows: list[tuple[str, str]],
) -> list[str]:
    """Return the lines of a two-column Markdown table: header, rule, rows."""
    left, right = headers
    table = [
        f"| {left} | {right} |",
        # The rule under each header cell is as wide as the header text.
        f"| {'-' * len(left)} | {'-' * len(right)} |",
    ]
    table.extend(f"| {name} | {value} |" for name, value in rows)
    return table


def render_report(
    distribution: HorizontalErrorDistribution,
    context: ReportContext,
    *,
    passed: bool,
) -> str:
    """Render the full Markdown report body.

    The output layout (header + horizontal-error stats + threshold
    table + vertical-error stats + verdict) is the schema referenced
    by ``_docs/02_document/tests/blackbox-tests.md``.

    Args:
        distribution: Error distribution whose statistics fill the
            tables. Threshold rows are emitted in ascending threshold
            order.
        context: Run context rendered in the "Run context" section and
            in the title.
        passed: Verdict to print; the function does not recompute it
            from ``distribution``.

    Returns:
        The report text, newline-terminated. When
        ``distribution.vertical_count`` is not positive, the vertical
        section holds a one-line note instead of a table.
    """
    verdict = "PASS" if passed else "FAIL"

    lines: list[str] = [
        f"# Real-flight validation — {context.run_date_utc}",
        "",
        (
            f"**Verdict**: {verdict} (AC-3 gate: "
            f"≥ {AC3_GATE_PCT:.0f} % within "
            f"{AC3_GATE_THRESHOLD_M:.0f} m)"
        ),
        "",
        "## Run context",
        "",
        f"- Tlog: `{context.tlog_path}`",
        f"- Video: `{context.video_path}`",
        f"- Calibration acquisition method: {context.calibration_acquisition_method}",
        f"- Clip duration: {context.clip_duration_s:.1f} s",
        f"- Emissions consumed: {context.emissions_count}",
        f"- Ground-truth pairings: {distribution.count}",
        "",
    ]

    horizontal_stats = [
        ("Mean", distribution.horizontal_error_mean_m),
        ("p50", distribution.horizontal_error_p50_m),
        ("p95", distribution.horizontal_error_p95_m),
        ("p99", distribution.horizontal_error_p99_m),
    ]
    lines += ["## Horizontal error (metres)", ""]
    lines += _markdown_table(
        ("Statistic", "Value"),
        [(name, f"{value:.2f}") for name, value in horizontal_stats],
    )
    lines.append("")

    lines += ["## Threshold-hit share", ""]
    lines += _markdown_table(
        ("Threshold (m)", "Hit share (%)"),
        [
            (f"{threshold:g}", f"{share * 100.0:.1f}")
            for threshold, share in sorted(distribution.threshold_hit_share.items())
        ],
    )
    lines.append("")

    if distribution.vertical_count > 0:
        lines += ["## Vertical error (metres)", ""]
        lines += _markdown_table(
            ("Statistic", "Value"),
            [
                ("Mean", f"{distribution.vertical_error_mean_m:.2f}"),
                ("p50", f"{distribution.vertical_error_p50_m:.2f}"),
                ("p95", f"{distribution.vertical_error_p95_m:.2f}"),
                ("Samples", f"{distribution.vertical_count}"),
            ],
        )
    else:
        lines += [
            "## Vertical error",
            "",
            "_No emissions carried a comparable altitude — vertical stats skipped._",
        ]
    lines.append("")

    return "\n".join(lines) + "\n"
|
||||
@@ -0,0 +1,321 @@
|
||||
"""AZ-699 — Real-flight validation runner + accuracy report.
|
||||
|
||||
Runs the ``gps-denied-replay`` console-script against the **real**
|
||||
``derkachi.tlog`` (binary, not the AZ-265 synth) plus the real
|
||||
flight video and the AZ-702 factory-sheet camera calibration,
|
||||
computes the full horizontal-error distribution, and writes a
|
||||
structured Markdown accuracy report to
|
||||
``_docs/06_metrics/real_flight_validation_{YYYY-MM-DD}.md``.
|
||||
|
||||
Unlike the AZ-404 1-min test (``test_derkachi_1min.py``), this one
|
||||
emits an **honest** PASS/FAIL verdict — no ``@xfail`` mask. The
|
||||
threshold gate is ≥ 80 % of emissions within 100 m (AZ-696 epic
|
||||
AC-3). When the run fails, the failure message references the
|
||||
calibration acquisition method (factory-sheet for AZ-702 or the
|
||||
``adti26.json`` placeholder) so the root cause is attributable
|
||||
without re-reading the source.
|
||||
|
||||
The test SKIPS (not ``@xfail``) when any prerequisite is absent:
|
||||
|
||||
* ``RUN_REPLAY_E2E`` env var not set (matches AZ-404 gating).
|
||||
* Real ``derkachi.tlog`` binary missing.
|
||||
* Real ``flight_derkachi.mp4`` missing OR placeholder-sized
|
||||
(the repo currently ships a 134-byte placeholder for the
|
||||
AZ-404 fixture; this AZ-699 path requires a decodable video).
|
||||
* ``gps-denied-replay`` console-script not installed.
|
||||
|
||||
Style: every test follows the Arrange / Act / Assert pattern.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from gps_denied_onboard.helpers.gps_compare import (
|
||||
GroundTruthRow,
|
||||
horizontal_error_distribution,
|
||||
)
|
||||
from gps_denied_onboard.replay_input import load_tlog_ground_truth
|
||||
from tests.e2e.replay._report_writer import (
|
||||
AC3_GATE_PCT,
|
||||
AC3_GATE_THRESHOLD_M,
|
||||
ReportContext,
|
||||
format_failure_message,
|
||||
render_report,
|
||||
verdict_passes_ac3,
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Skip gates
|
||||
|
||||
|
||||
def _repo_root() -> Path:
    """Return the directory three levels above this file's directory.

    NOTE(review): this is the repository root only if the module sits
    three directories deep (e.g. ``tests/e2e/replay/``) — confirm if the
    file is ever moved.
    """
    return Path(__file__).resolve().parents[3]
|
||||
|
||||
|
||||
def _derkachi_dir() -> Path:
    """Return the Derkachi flight fixture directory under the repo root."""
    return _repo_root() / "_docs" / "00_problem" / "input_data" / "flight_derkachi"
|
||||
|
||||
|
||||
# A real video must be larger than the checked-in placeholder to be
# decodable. The current placeholder is 134 bytes; the floor is set at
# 1 MB so the placeholder is always rejected while any real recording
# is expected to clear it.
_MIN_REAL_VIDEO_BYTES: int = 1_000_000
|
||||
|
||||
|
||||
def _real_inputs_present() -> tuple[bool, str]:
    """Check that the real tlog, video and calibration fixtures exist.

    Returns:
        ``(True, "")`` when every input is usable; otherwise
        ``(False, reason)`` where ``reason`` names the first missing or
        unusable input. Checks run in the order tlog, video (existence,
        then size), calibration.
    """
    derkachi = _derkachi_dir()
    tlog = derkachi / "derkachi.tlog"
    video = derkachi / "flight_derkachi.mp4"
    calibration = derkachi / "khp20s30_factory.json"

    if not tlog.is_file():
        return False, f"real tlog missing: {tlog}"
    if not video.is_file():
        return False, f"real video missing: {video}"

    # Stat once so the size that is compared is the size that is reported.
    video_bytes = video.stat().st_size
    if video_bytes < _MIN_REAL_VIDEO_BYTES:
        return (
            False,
            f"video at {video} is only {video_bytes} bytes — "
            "placeholder, not a real recording. Drop the real "
            "flight_derkachi.mp4 into the fixture dir to enable AZ-699.",
        )

    if not calibration.is_file():
        return False, (
            f"calibration missing: {calibration}; ship the AZ-702 "
            "factory-sheet calibration to enable AZ-699."
        )
    return True, ""
|
||||
|
||||
|
||||
def _heavy_skip_reason() -> str | None:
|
||||
if os.environ.get("RUN_REPLAY_E2E", "").lower() not in {
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
"on",
|
||||
}:
|
||||
return "AZ-699 e2e test gated by RUN_REPLAY_E2E=1"
|
||||
return None
|
||||
|
||||
|
||||
def _binary_present() -> tuple[bool, str | None]:
|
||||
binary = shutil.which("gps-denied-replay")
|
||||
if binary is not None:
|
||||
return True, binary
|
||||
venv_bin = Path(sys.executable).parent / "gps-denied-replay"
|
||||
if venv_bin.exists():
|
||||
return True, str(venv_bin)
|
||||
return False, None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Helpers
|
||||
|
||||
|
||||
def _read_calibration_acquisition_method(calibration_path: Path) -> str:
|
||||
"""Extract the ``acquisition_method`` field set by AZ-702.
|
||||
|
||||
AZ-702 documents that the camera calibration JSON carries an
|
||||
``acquisition_method`` string ("factory-sheet" for the
|
||||
KHP20S30 factory deliverable). When the field is absent we
|
||||
label the run as ``"unknown"`` so the failure message stays
|
||||
honest — pretending the calibration is factory-sheet when it
|
||||
is actually a placeholder would mis-attribute the residual.
|
||||
"""
|
||||
try:
|
||||
data = json.loads(calibration_path.read_text())
|
||||
except (OSError, json.JSONDecodeError):
|
||||
return "unknown"
|
||||
method = data.get("acquisition_method")
|
||||
if isinstance(method, str) and method:
|
||||
return method
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _parse_jsonl(path: Path) -> list[dict[str, Any]]:
|
||||
rows: list[dict[str, Any]] = []
|
||||
for line in path.read_text().splitlines():
|
||||
if not line.strip():
|
||||
continue
|
||||
rows.append(json.loads(line))
|
||||
return rows
|
||||
|
||||
|
||||
def _load_full_ground_truth(tlog_path: Path) -> list[GroundTruthRow]:
    """Load every ground-truth fix from the tlog as ``GroundTruthRow`` items.

    Args:
        tlog_path: Tlog file handed to ``load_tlog_ground_truth``.

    Returns:
        One row per record, in the order the loader returns them. The
        timestamp is ``ts_ns / 1e9`` (nanoseconds to seconds, going by
        the field names); latitude, longitude and altitude are copied
        unchanged.
    """
    series = load_tlog_ground_truth(tlog_path).records
    return [
        GroundTruthRow(
            t_s=fix.ts_ns / 1e9,
            lat_deg=fix.lat_deg,
            lon_deg=fix.lon_deg,
            alt_m=fix.alt_m,
        )
        for fix in series
    ]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-1 + AC-2 + AC-3: full real-flight run + honest verdict + report
|
||||
|
||||
|
||||
@pytest.mark.tier2
def test_az699_real_flight_validation_emits_verdict_and_report(
    tmp_path: Path,
) -> None:
    """Run the real-flight replay, write the accuracy report, assert AC-3.

    The test skips when a prerequisite is absent (env gate, fixtures,
    console-script). Otherwise it runs ``gps-denied-replay --auto-trim``
    within a 15-minute budget, pairs the emitted JSONL with the tlog
    ground truth, writes the Markdown report under
    ``_docs/06_metrics/`` and fails when the AC-3 gate is not met.
    """
    # Arrange: gate cleanly on missing prerequisites (no xfail).
    heavy_reason = _heavy_skip_reason()
    if heavy_reason is not None:
        pytest.skip(heavy_reason)
    inputs_ok, inputs_reason = _real_inputs_present()
    if not inputs_ok:
        pytest.skip(inputs_reason)
    binary_ok, binary = _binary_present()
    if not binary_ok or binary is None:
        pytest.skip("gps-denied-replay console-script not installed")

    derkachi = _derkachi_dir()
    tlog_path = derkachi / "derkachi.tlog"
    video_path = derkachi / "flight_derkachi.mp4"
    calibration_path = derkachi / "khp20s30_factory.json"
    output_path = tmp_path / "estimator_output.jsonl"

    # Signing-key stub (NoopMavlinkTransport ignores bytes; path
    # validation just needs the file to exist).
    signing_key_path = tmp_path / "signing_key.bin"
    signing_key_path.write_bytes(b"\x00" * 32)

    config_path = tmp_path / "config.yaml"
    config_path.write_text(
        "mode: replay\n"
        "replay:\n"
        "  pace: asap\n"
        "  target_fc_dialect: ardupilot_plane\n"
    )

    argv = [
        binary,
        "--video",
        str(video_path),
        "--tlog",
        str(tlog_path),
        "--output",
        str(output_path),
        "--camera-calibration",
        str(calibration_path),
        "--config",
        str(config_path),
        "--mavlink-signing-key",
        str(signing_key_path),
        "--pace",
        "asap",
        # AZ-698: let the segmenter + NCC find the last flight
        # window in the multi-flight tlog. No manual offset.
        "--auto-trim",
    ]

    # 15-min NFR budget per AZ-699 spec; used for both the subprocess
    # timeout and the wall-clock assertion so the two cannot drift.
    budget_s = 900.0

    # Act
    t0 = time.monotonic()
    try:
        completed = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=budget_s,
        )
    except subprocess.TimeoutExpired:
        # Report a budget overrun as the NFR failure it is, rather than
        # letting the raw TimeoutExpired surface as a test error.
        pytest.fail(
            "AZ-699 NFR: real-flight run exceeded "
            f"{budget_s:.0f} s and was killed; budget is 15 min"
        )
    wall_clock_s = time.monotonic() - t0

    assert completed.returncode == 0, (
        f"gps-denied-replay exited {completed.returncode}\n"
        f"stdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
    )
    assert wall_clock_s <= budget_s, (
        f"AZ-699 NFR: real-flight run took {wall_clock_s:.1f} s; "
        "budget is 15 min"
    )

    emissions = _parse_jsonl(output_path)
    assert emissions, "no JSONL emissions produced — pipeline failure"

    ground_truth = _load_full_ground_truth(tlog_path)
    assert ground_truth, "ground-truth extraction yielded zero rows"
    distribution = horizontal_error_distribution(
        emissions, ground_truth
    )
    assert distribution.count > 0, (
        "no emissions paired with ground truth — JSONL timestamps "
        "outside the tlog GPS window?"
    )

    context = ReportContext(
        run_date_utc=datetime.datetime.now(datetime.timezone.utc)
        .date()
        .isoformat(),
        tlog_path=tlog_path,
        video_path=video_path,
        calibration_acquisition_method=_read_calibration_acquisition_method(
            calibration_path
        ),
        # ground_truth is non-empty here (asserted above), so the span
        # between its first and last rows is always defined.
        clip_duration_s=ground_truth[-1].t_s - ground_truth[0].t_s,
        emissions_count=len(emissions),
    )

    passed = verdict_passes_ac3(distribution)
    report_text = render_report(distribution, context, passed=passed)

    report_dir = _repo_root() / "_docs" / "06_metrics"
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = (
        report_dir / f"real_flight_validation_{context.run_date_utc}.md"
    )
    # The report contains non-ASCII characters ("—", "≥"); pin UTF-8 so
    # writing and reading do not depend on the platform locale.
    report_path.write_text(report_text, encoding="utf-8")

    # Assert AC-2: the report exists and contains all required rows.
    assert report_path.is_file()
    written = report_path.read_text(encoding="utf-8")
    assert "## Horizontal error (metres)" in written
    assert "## Threshold-hit share" in written
    assert "Mean" in written
    assert "p50" in written
    assert "p95" in written
    assert "p99" in written
    for threshold in (10, 25, 50, 100):
        assert f"| {threshold} |" in written, (
            f"threshold {threshold} m row missing from report"
        )

    # Assert AC-1 + AC-3: honest verdict. On FAIL, the message must
    # reference the calibration method (AZ-699 AC-3).
    if not passed:
        msg = format_failure_message(distribution, context)
        pytest.fail(
            f"{msg}\n\nReport: {report_path}\n"
            f"Run wall-clock: {wall_clock_s:.1f} s"
        )

    # PASS path — emit a concise summary so the CI log carries the
    # numbers without forcing the operator to open the report.
    share_100m = distribution.threshold_hit_share[AC3_GATE_THRESHOLD_M] * 100.0
    assert share_100m >= AC3_GATE_PCT  # guarded by verdict_passes_ac3
    print(
        f"AZ-699 PASS — {share_100m:.1f} % within "
        f"{AC3_GATE_THRESHOLD_M:.0f} m, mean="
        f"{distribution.horizontal_error_mean_m:.1f} m, p95="
        f"{distribution.horizontal_error_p95_m:.1f} m. "
        f"Report at {report_path}",
        file=sys.stderr,
    )
|
||||
Reference in New Issue
Block a user