[AZ-699] Real-flight validation runner + Markdown accuracy report

New e2e test runs gps-denied-replay --auto-trim against the real derkachi.tlog + flight video + AZ-702 calibration, computes the horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m threshold-hit share), writes _docs/06_metrics/real_flight_validation_{date}.md, and asserts honest PASS/FAIL with no @xfail mask. AZ-404's 1-min test is untouched (sibling, not replacement). Extends gps_compare.py with HorizontalErrorDistribution + percentile_sorted (numpy-equivalent linear interpolation). New test helper _report_writer.py renders the canonical Markdown schema documented as FT-P-20 in blackbox-tests.md. 16 new unit tests pin distribution arithmetic, verdict gate, failure-message templating (references calibration acquisition method per AC-3), and report layout. 129 passed in focused regression, 3 skipped (real video / Tier-2 prerequisites). Zero new mypy --strict errors. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 12:31:13 +00:00 · 2026-05-20 16:53:48 +03:00
parent f5366bbca1
commit dcde602f61
9 changed files with 1261 additions and 2 deletions
@@ -0,0 +1,188 @@
+"""AZ-699 Markdown accuracy-report writer (test helper).
+
+Renders a :class:`HorizontalErrorDistribution` (the production
+helper in ``gps_denied_onboard.helpers.gps_compare``) plus run
+context (calibration acquisition method, clip duration, fixture
+paths) into the canonical Markdown layout consumed by
+``_docs/06_metrics/real_flight_validation_{date}.md``.
+
+This module lives under ``tests/`` (NOT production) — the report
+is an artefact of running the AZ-699 e2e test. Promoting the
+writer to ``src/`` would invite production code to import a test
+helper, so the file ownership rule keeps it here.
+
+Style: every function is pure; the side effect (writing the file)
+is the caller's. Tests in ``tests/unit/test_az699_report_writer.py``
+exercise both the rendering and the threshold-gate verdict logic.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from gps_denied_onboard.helpers.gps_compare import HorizontalErrorDistribution
+
+# Public API of this helper module: the AC-3 gate constants, the
+# run-context record, and the three pure report/verdict functions.
+__all__ = [
+    "AC3_GATE_PCT",
+    "AC3_GATE_THRESHOLD_M",
+    "ReportContext",
+    "format_failure_message",
+    "render_report",
+    "verdict_passes_ac3",
+]
+
+
+# AZ-696 epic AC-3 gate: the verdict passes when the hit share
+# recorded for the AC3_GATE_THRESHOLD_M (metres) threshold, expressed
+# as a percentage, is at least AC3_GATE_PCT (percent). They are named
+# module constants (rather than literals inlined into the test) so the
+# unit tests for the failure-message template can pin them.
+AC3_GATE_THRESHOLD_M: float = 100.0
+AC3_GATE_PCT: float = 80.0
+
+
+@dataclass(frozen=True)
+class ReportContext:
+    """Immutable run context surfaced in the report and failure message.
+
+    The dataclass is frozen, so a context cannot be mutated after it
+    has been built for a run.
+
+    Attributes:
+        run_date_utc: ISO-8601 date (YYYY-MM-DD) at which the run
+            executed. It is interpolated into the report title and
+            into the ``real_flight_validation_{date}.md`` filename
+            quoted by the failure message.
+        tlog_path: Real tlog the runner consumed.
+        video_path: Video clip the runner consumed.
+        calibration_acquisition_method: Provenance of the camera
+            calibration (e.g. ``"factory-sheet"`` for AZ-702 or
+            ``"placeholder"`` for the adti26 fallback). Surfaced in
+            the failure message per AZ-699 AC-3.
+        clip_duration_s: Duration of the analysed clip in seconds.
+        emissions_count: Total estimator-output records consumed
+            from the JSONL (may differ from
+            ``distribution.count`` when some emissions land
+            outside the GT window).
+    """
+
+    run_date_utc: str
+    tlog_path: Path
+    video_path: Path
+    calibration_acquisition_method: str
+    clip_duration_s: float
+    emissions_count: int
+
+
+def verdict_passes_ac3(distribution: HorizontalErrorDistribution) -> bool:
+    """Decide whether *distribution* satisfies the AZ-696 epic AC-3 gate.
+
+    The gate fails outright when there are no pairings
+    (``distribution.count == 0``) or when no hit share was recorded
+    for ``AC3_GATE_THRESHOLD_M``. Otherwise it passes when that share,
+    converted to a percentage, is at least ``AC3_GATE_PCT``.
+    """
+    has_pairings = distribution.count != 0
+    if not has_pairings:
+        return False
+    hit_share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M)
+    return hit_share is not None and hit_share * 100.0 >= AC3_GATE_PCT
+
+
+def format_failure_message(
+    distribution: HorizontalErrorDistribution,
+    context: ReportContext,
+) -> str:
+    """Compose the single-line AZ-699 AC-3 failure message.
+
+    The message is four sentences: the measured hit share against
+    the gate, the residual statistics (mean / p50 / p95 / p99), the
+    calibration acquisition method, and a pointer to the dated report
+    file. A missing ``AC3_GATE_THRESHOLD_M`` entry in the hit-share
+    mapping is reported as 0.0 %.
+    """
+    hit_share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M, 0.0)
+    hit_pct = hit_share * 100.0
+
+    headline = (
+        f"AZ-699 AC-3: only {hit_pct:.1f} % of {distribution.count} emissions "
+        f"within {AC3_GATE_THRESHOLD_M:.0f} m of ground truth; "
+        f"epic threshold is {AC3_GATE_PCT:.0f} %."
+    )
+    residual = (
+        f"Residual: mean={distribution.horizontal_error_mean_m:.1f} m, "
+        f"p50={distribution.horizontal_error_p50_m:.1f} m, "
+        f"p95={distribution.horizontal_error_p95_m:.1f} m, "
+        f"p99={distribution.horizontal_error_p99_m:.1f} m."
+    )
+    calibration = f"Calibration: {context.calibration_acquisition_method}."
+    pointer = (
+        f"See _docs/06_metrics/real_flight_validation_{context.run_date_utc}.md "
+        "for the full distribution."
+    )
+    return " ".join((headline, residual, calibration, pointer))
+
+
+def render_report(
+    distribution: HorizontalErrorDistribution,
+    context: ReportContext,
+    *,
+    passed: bool,
+) -> str:
+    """Assemble the Markdown accuracy report as one string.
+
+    Sections, in order: title and verdict line, run context bullets,
+    horizontal-error table (mean / p50 / p95 / p99), threshold-hit
+    table (one row per threshold, ascending), and a vertical-error
+    section. The vertical section is a table when
+    ``distribution.vertical_count > 0`` and a one-line notice
+    otherwise. The result always ends with a newline. Nothing is
+    written to disk here; the caller owns that side effect.
+    """
+    verdict_label = "PASS" if passed else "FAIL"
+    stat_table_head = ["| Statistic | Value |", "| --------- | ----- |"]
+
+    header = [
+        f"# Real-flight validation — {context.run_date_utc}",
+        "",
+        f"**Verdict**: {verdict_label} (AC-3 gate: ≥ {AC3_GATE_PCT:.0f} % "
+        f"within {AC3_GATE_THRESHOLD_M:.0f} m)",
+        "",
+    ]
+
+    run_context = [
+        "## Run context",
+        "",
+        f"- Tlog: `{context.tlog_path}`",
+        f"- Video: `{context.video_path}`",
+        f"- Calibration acquisition method: {context.calibration_acquisition_method}",
+        f"- Clip duration: {context.clip_duration_s:.1f} s",
+        f"- Emissions consumed: {context.emissions_count}",
+        f"- Ground-truth pairings: {distribution.count}",
+        "",
+    ]
+
+    horizontal = [
+        "## Horizontal error (metres)",
+        "",
+        *stat_table_head,
+        f"| Mean | {distribution.horizontal_error_mean_m:.2f} |",
+        f"| p50 | {distribution.horizontal_error_p50_m:.2f} |",
+        f"| p95 | {distribution.horizontal_error_p95_m:.2f} |",
+        f"| p99 | {distribution.horizontal_error_p99_m:.2f} |",
+        "",
+    ]
+
+    thresholds = [
+        "## Threshold-hit share",
+        "",
+        "| Threshold (m) | Hit share (%) |",
+        "| ------------- | ------------- |",
+    ]
+    # Ascending threshold order keeps the table stable between runs.
+    for limit_m, hit_share in sorted(distribution.threshold_hit_share.items()):
+        thresholds.append(f"| {limit_m:g} | {hit_share * 100.0:.1f} |")
+    thresholds.append("")
+
+    # Vertical statistics are only read when at least one sample exists.
+    if distribution.vertical_count > 0:
+        vertical = [
+            "## Vertical error (metres)",
+            "",
+            *stat_table_head,
+            f"| Mean | {distribution.vertical_error_mean_m:.2f} |",
+            f"| p50 | {distribution.vertical_error_p50_m:.2f} |",
+            f"| p95 | {distribution.vertical_error_p95_m:.2f} |",
+            f"| Samples | {distribution.vertical_count} |",
+            "",
+        ]
+    else:
+        vertical = [
+            "## Vertical error",
+            "",
+            "_No emissions carried a comparable altitude — vertical stats skipped._",
+            "",
+        ]
+
+    sections = header + run_context + horizontal + thresholds + vertical
+    return "\n".join(sections) + "\n"
@@ -0,0 +1,321 @@
+"""AZ-699 — Real-flight validation runner + accuracy report.
+
+Runs the ``gps-denied-replay`` console-script against the **real**
+``derkachi.tlog`` (binary, not the AZ-265 synth) plus the real
+flight video and the AZ-702 factory-sheet camera calibration,
+computes the full horizontal-error distribution, and writes a
+structured Markdown accuracy report to
+``_docs/06_metrics/real_flight_validation_{YYYY-MM-DD}.md``.
+
+Unlike the AZ-404 1-min test (``test_derkachi_1min.py``), this one
+emits an **honest** PASS/FAIL verdict — no ``@xfail`` mask. The
+threshold gate is ≥ 80 % of emissions within 100 m (AZ-696 epic
+AC-3). When the run fails, the failure message references the
+calibration acquisition method (factory-sheet for AZ-702 or the
+``adti26.json`` placeholder) so the root cause is attributable
+without re-reading the source.
+
+The test SKIPS (not ``@xfail``) when any prerequisite is absent:
+
+* ``RUN_REPLAY_E2E`` env var not set (matches AZ-404 gating).
+* Real ``derkachi.tlog`` binary missing.
+* Real ``flight_derkachi.mp4`` missing OR placeholder-sized
+  (the repo currently ships a 134-byte placeholder for the
+  AZ-404 fixture; this AZ-699 path requires a decodable video).
+* ``gps-denied-replay`` console-script not installed.
+
+Style: every test follows the Arrange / Act / Assert pattern.
+"""
+
+from __future__ import annotations
+
+import datetime
+import json
+import os
+import shutil
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from gps_denied_onboard.helpers.gps_compare import (
+    GroundTruthRow,
+    horizontal_error_distribution,
+)
+from gps_denied_onboard.replay_input import load_tlog_ground_truth
+from tests.e2e.replay._report_writer import (
+    AC3_GATE_PCT,
+    AC3_GATE_THRESHOLD_M,
+    ReportContext,
+    format_failure_message,
+    render_report,
+    verdict_passes_ac3,
+)
+
+
+# ----------------------------------------------------------------------
+# Skip gates
+
+
+def _repo_root() -> Path:
+    """Return ``parents[3]`` of this file's resolved path.
+
+    Callers in this module treat that directory as the repository
+    root when locating fixtures and the report directory.
+    """
+    this_file = Path(__file__).resolve()
+    return this_file.parents[3]
+
+
+def _derkachi_dir() -> Path:
+    """Return the ``flight_derkachi`` fixture directory under the repo root."""
+    return _repo_root().joinpath(
+        "_docs", "00_problem", "input_data", "flight_derkachi"
+    )
+
+
+# Size floor used to tell a real recording from the checked-in
+# placeholder. The current placeholder is 134 bytes; the floor is
+# 1 MB (1_000_000 bytes), so the placeholder is always rejected by
+# the skip gate while a larger file is accepted.
+# NOTE(review): assumes every real recording exceeds 1 MB — confirm.
+_MIN_REAL_VIDEO_BYTES: int = 1_000_000
+
+
+def _real_inputs_present() -> tuple[bool, str]:
+    """Check that the real tlog, video and calibration fixtures exist.
+
+    Returns:
+        ``(True, "")`` when all three files are present and the video
+        is at least ``_MIN_REAL_VIDEO_BYTES`` long; otherwise
+        ``(False, reason)`` where *reason* names the first failing
+        prerequisite (checked in the order tlog, video, video size,
+        calibration).
+    """
+    fixture_dir = _derkachi_dir()
+    tlog_file = fixture_dir / "derkachi.tlog"
+    video_file = fixture_dir / "flight_derkachi.mp4"
+    calibration_file = fixture_dir / "khp20s30_factory.json"
+
+    problem = ""
+    if not tlog_file.is_file():
+        problem = f"real tlog missing: {tlog_file}"
+    elif not video_file.is_file():
+        problem = f"real video missing: {video_file}"
+    elif video_file.stat().st_size < _MIN_REAL_VIDEO_BYTES:
+        problem = (
+            f"video at {video_file} is only {video_file.stat().st_size} bytes — "
+            "placeholder, not a real recording. Drop the real "
+            "flight_derkachi.mp4 into the fixture dir to enable AZ-699."
+        )
+    elif not calibration_file.is_file():
+        problem = (
+            f"calibration missing: {calibration_file}; ship the AZ-702 "
+            "factory-sheet calibration to enable AZ-699."
+        )
+    # Every failure reason is non-empty, so an empty string means success.
+    return not problem, problem
+
+
+def _heavy_skip_reason() -> str | None:
+    """Return a skip reason unless ``RUN_REPLAY_E2E`` opts in.
+
+    The variable is read case-insensitively; ``1``, ``true``, ``yes``
+    and ``on`` enable the test (``None`` is returned). Any other value,
+    including unset, yields the skip message.
+    """
+    flag = os.environ.get("RUN_REPLAY_E2E", "").lower()
+    if flag in {"1", "true", "yes", "on"}:
+        return None
+    return "AZ-699 e2e test gated by RUN_REPLAY_E2E=1"
+
+
+def _binary_present() -> tuple[bool, str | None]:
+    """Locate the ``gps-denied-replay`` console-script.
+
+    ``PATH`` is searched first; failing that, the directory holding
+    the running interpreter is checked for a file of that name.
+
+    Returns:
+        ``(True, path)`` when the script was found, ``(False, None)``
+        otherwise.
+    """
+    resolved = shutil.which("gps-denied-replay")
+    if resolved is None:
+        beside_interpreter = Path(sys.executable).parent / "gps-denied-replay"
+        if beside_interpreter.exists():
+            resolved = str(beside_interpreter)
+    return resolved is not None, resolved
+
+
+# ----------------------------------------------------------------------
+# Helpers
+
+
+def _read_calibration_acquisition_method(calibration_path: Path) -> str:
+    """Extract the ``acquisition_method`` field set by AZ-702.
+
+    AZ-702 documents that the camera calibration JSON carries an
+    ``acquisition_method`` string ("factory-sheet" for the
+    KHP20S30 factory deliverable). When the field is absent we
+    label the run as ``"unknown"`` so the failure message stays
+    honest — pretending the calibration is factory-sheet when it
+    is actually a placeholder would mis-attribute the residual.
+
+    Args:
+        calibration_path: Path to the calibration JSON file.
+
+    Returns:
+        The non-empty ``acquisition_method`` string, or ``"unknown"``
+        when the file cannot be read, is not valid UTF-8 JSON, is not
+        a JSON object, or lacks a non-empty string for that key.
+    """
+    try:
+        # JSON is UTF-8 by specification; do not depend on the locale.
+        data = json.loads(calibration_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # ValueError covers both json.JSONDecodeError and
+        # UnicodeDecodeError (undecodable bytes).
+        return "unknown"
+    # Valid JSON need not be an object (e.g. ``[]`` or ``null``);
+    # only a mapping can carry the field.
+    if not isinstance(data, dict):
+        return "unknown"
+    method = data.get("acquisition_method")
+    if isinstance(method, str) and method:
+        return method
+    return "unknown"
+
+
+def _parse_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Parse a JSON Lines file into a list of decoded records.
+
+    Blank and whitespace-only lines are skipped. The file is read as
+    UTF-8 and split on newline characters only. ``str.splitlines()``
+    is deliberately avoided: it also breaks on U+2028 / U+2029 and
+    other separators that may legally appear raw inside a JSON
+    string, which would cut a valid record in half.
+
+    Args:
+        path: JSONL file to read.
+
+    Returns:
+        One decoded value per non-blank line, in file order.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    with path.open(encoding="utf-8") as handle:
+        return [json.loads(line) for line in handle if line.strip()]
+
+
+def _load_full_ground_truth(tlog_path: Path) -> list[GroundTruthRow]:
+    """Load every ground-truth fix from *tlog_path* as ``GroundTruthRow``s.
+
+    Each record's ``ts_ns`` is divided by 1e9 to populate ``t_s``;
+    latitude, longitude and altitude are copied through unchanged.
+    """
+    rows: list[GroundTruthRow] = []
+    for fix in load_tlog_ground_truth(tlog_path).records:
+        rows.append(
+            GroundTruthRow(
+                t_s=fix.ts_ns / 1e9,
+                lat_deg=fix.lat_deg,
+                lon_deg=fix.lon_deg,
+                alt_m=fix.alt_m,
+            )
+        )
+    return rows
+
+
+# ----------------------------------------------------------------------
+# AC-1 + AC-2 + AC-3: full real-flight run + honest verdict + report
+
+
+@pytest.mark.tier2
+def test_az699_real_flight_validation_emits_verdict_and_report(
+    tmp_path: Path,
+) -> None:
+    """Run the replay binary on the real flight and gate on AZ-696 AC-3.
+
+    Skips when ``RUN_REPLAY_E2E`` is not enabled, a real fixture is
+    missing, or the console-script is not installed. Otherwise it
+    runs ``gps-denied-replay --auto-trim``, builds the horizontal-error
+    distribution against the tlog ground truth, writes the Markdown
+    report under ``_docs/06_metrics/``, checks the report layout, and
+    fails with the templated AC-3 message when the gate is not met.
+    """
+    # Arrange: gate cleanly on missing prerequisites (no xfail).
+    heavy_reason = _heavy_skip_reason()
+    if heavy_reason is not None:
+        pytest.skip(heavy_reason)
+    inputs_ok, inputs_reason = _real_inputs_present()
+    if not inputs_ok:
+        pytest.skip(inputs_reason)
+    binary_ok, binary = _binary_present()
+    if not binary_ok or binary is None:
+        pytest.skip("gps-denied-replay console-script not installed")
+
+    derkachi = _derkachi_dir()
+    tlog_path = derkachi / "derkachi.tlog"
+    video_path = derkachi / "flight_derkachi.mp4"
+    calibration_path = derkachi / "khp20s30_factory.json"
+    output_path = tmp_path / "estimator_output.jsonl"
+
+    # Signing-key stub (NoopMavlinkTransport ignores bytes; path
+    # validation just needs the file to exist).
+    signing_key_path = tmp_path / "signing_key.bin"
+    signing_key_path.write_bytes(b"\x00" * 32)
+
+    config_path = tmp_path / "config.yaml"
+    config_path.write_text(
+        "mode: replay\n"
+        "replay:\n"
+        "  pace: asap\n"
+        "  target_fc_dialect: ardupilot_plane\n",
+        encoding="utf-8",
+    )
+
+    argv = [
+        binary,
+        "--video",
+        str(video_path),
+        "--tlog",
+        str(tlog_path),
+        "--output",
+        str(output_path),
+        "--camera-calibration",
+        str(calibration_path),
+        "--config",
+        str(config_path),
+        "--mavlink-signing-key",
+        str(signing_key_path),
+        "--pace",
+        "asap",
+        # AZ-698: let the segmenter + NCC find the last flight
+        # window in the multi-flight tlog. No manual offset.
+        "--auto-trim",
+    ]
+
+    # 15-min NFR budget per AZ-699 spec; shared by the subprocess
+    # timeout and the wall-clock assertion below.
+    budget_s = 900.0
+
+    # Act
+    t0 = time.monotonic()
+    try:
+        completed = subprocess.run(
+            argv,
+            capture_output=True,
+            text=True,
+            timeout=budget_s,
+        )
+    except subprocess.TimeoutExpired as exc:
+        # Report a budget overrun as an honest test failure with the
+        # NFR message instead of an opaque TimeoutExpired error.
+        pytest.fail(
+            "AZ-699 NFR: real-flight run exceeded the "
+            f"{budget_s:.0f} s (15 min) budget and was killed\n"
+            f"stdout:\n{exc.stdout!r}\nstderr:\n{exc.stderr!r}"
+        )
+    wall_clock_s = time.monotonic() - t0
+
+    assert completed.returncode == 0, (
+        f"gps-denied-replay exited {completed.returncode}\n"
+        f"stdout:\n{completed.stdout}\nstderr:\n{completed.stderr}"
+    )
+    assert wall_clock_s <= budget_s, (
+        f"AZ-699 NFR: real-flight run took {wall_clock_s:.1f} s; "
+        "budget is 15 min"
+    )
+
+    emissions = _parse_jsonl(output_path)
+    assert emissions, "no JSONL emissions produced — pipeline failure"
+
+    ground_truth = _load_full_ground_truth(tlog_path)
+    assert ground_truth, "ground-truth extraction yielded zero rows"
+    distribution = horizontal_error_distribution(
+        emissions, ground_truth
+    )
+    assert distribution.count > 0, (
+        "no emissions paired with ground truth — JSONL timestamps "
+        "outside the tlog GPS window?"
+    )
+
+    run_date_utc = (
+        datetime.datetime.now(datetime.timezone.utc).date().isoformat()
+    )
+    context = ReportContext(
+        run_date_utc=run_date_utc,
+        tlog_path=tlog_path,
+        video_path=video_path,
+        calibration_acquisition_method=_read_calibration_acquisition_method(
+            calibration_path
+        ),
+        # ground_truth is asserted non-empty above, so both ends exist.
+        clip_duration_s=ground_truth[-1].t_s - ground_truth[0].t_s,
+        emissions_count=len(emissions),
+    )
+
+    passed = verdict_passes_ac3(distribution)
+    report_text = render_report(distribution, context, passed=passed)
+
+    report_dir = _repo_root() / "_docs" / "06_metrics"
+    report_dir.mkdir(parents=True, exist_ok=True)
+    report_path = (
+        report_dir / f"real_flight_validation_{context.run_date_utc}.md"
+    )
+    # The report contains non-ASCII characters ("—", "≥"); pin UTF-8
+    # so the artefact does not depend on the runner's locale.
+    report_path.write_text(report_text, encoding="utf-8")
+
+    # Assert AC-2: the report exists and contains all required rows.
+    assert report_path.is_file()
+    written = report_path.read_text(encoding="utf-8")
+    assert "## Horizontal error (metres)" in written
+    assert "## Threshold-hit share" in written
+    assert "Mean" in written
+    assert "p50" in written
+    assert "p95" in written
+    assert "p99" in written
+    for threshold in (10, 25, 50, 100):
+        assert f"| {threshold} |" in written, (
+            f"threshold {threshold} m row missing from report"
+        )
+
+    # Assert AC-1 + AC-3: honest verdict. On FAIL, the message must
+    # reference the calibration method (AZ-699 AC-3).
+    if not passed:
+        msg = format_failure_message(distribution, context)
+        pytest.fail(
+            f"{msg}\n\nReport: {report_path}\n"
+            f"Run wall-clock: {wall_clock_s:.1f} s"
+        )
+
+    # PASS path — emit a concise summary so the CI log carries the
+    # numbers without forcing the operator to open the report.
+    share_100m = distribution.threshold_hit_share[AC3_GATE_THRESHOLD_M] * 100.0
+    assert share_100m >= AC3_GATE_PCT  # guarded by verdict_passes_ac3
+    print(
+        f"AZ-699 PASS — {share_100m:.1f} % within "
+        f"{AC3_GATE_THRESHOLD_M:.0f} m, mean="
+        f"{distribution.horizontal_error_mean_m:.1f} m, p95="
+        f"{distribution.horizontal_error_p95_m:.1f} m. "
+        f"Report at {report_path}",
+        file=sys.stderr,
+    )