From dcde602f618aca0e4bb11ab9c38b84eb1f2708b8 Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Wed, 20 May 2026 16:53:48 +0300 Subject: [PATCH] [AZ-699] Real-flight validation runner + Markdown accuracy report New e2e test runs gps-denied-replay --auto-trim against the real derkachi.tlog + flight video + AZ-702 calibration, computes the horizontal-error distribution (mean/p50/p95/p99 + 10/25/50/100 m threshold-hit share), writes _docs/06_metrics/real_flight_validation_{date}.md, and asserts honest PASS/FAIL with no @xfail mask. AZ-404's 1-min test is untouched (sibling, not replacement). Extends gps_compare.py with HorizontalErrorDistribution + percentile_sorted (numpy-equivalent linear interpolation). New test helper _report_writer.py renders the canonical Markdown schema documented as FT-P-20 in blackbox-tests.md. 16 new unit tests pin distribution arithmetic, verdict gate, failure-message templating (references calibration acquisition method per AC-3), and report layout. 129 passed in focused regression, 3 skipped (real video / Tier-2 prerequisites). Zero new mypy --strict errors. 
Co-authored-by: Cursor --- _docs/02_document/tests/blackbox-tests.md | 79 ++++ .../AZ-699_real_flight_validation_runner.md | 47 +++ .../batch_100_cycle2_report.md | 114 ++++++ _docs/_autodev_state.md | 4 +- src/gps_denied_onboard/helpers/__init__.py | 6 + src/gps_denied_onboard/helpers/gps_compare.py | 162 +++++++++ tests/e2e/replay/_report_writer.py | 188 ++++++++++ tests/e2e/replay/test_derkachi_real_tlog.py | 321 ++++++++++++++++ tests/unit/test_az699_report_writer.py | 342 ++++++++++++++++++ 9 files changed, 1261 insertions(+), 2 deletions(-) rename _docs/02_tasks/{todo => done}/AZ-699_real_flight_validation_runner.md (59%) create mode 100644 _docs/03_implementation/batch_100_cycle2_report.md create mode 100644 tests/e2e/replay/_report_writer.py create mode 100644 tests/e2e/replay/test_derkachi_real_tlog.py create mode 100644 tests/unit/test_az699_report_writer.py diff --git a/_docs/02_document/tests/blackbox-tests.md b/_docs/02_document/tests/blackbox-tests.md index 4453101..3239005 100644 --- a/_docs/02_document/tests/blackbox-tests.md +++ b/_docs/02_document/tests/blackbox-tests.md @@ -593,3 +593,82 @@ All tests run from the `e2e-runner` container against the SUT through public bou **Expected outcome**: All mid-flight tiles current-timestamped and fresh. **Max execution time**: 6 min. + +--- + +### FT-P-20: Real-flight validation runner — honest verdict + Markdown accuracy report + +**Summary**: Runs the full `gps-denied-replay` against the **real** Derkachi binary tlog + flight video + AZ-702 factory-sheet camera calibration, computes the per-emission horizontal-error distribution, and writes a structured Markdown accuracy report. Replaces the AZ-404 `@xfail` mask on AC-3 with a real PASS/FAIL. +**Traces to**: AZ-699 AC-1..AC-3 (epic AZ-696 AC-3 — the 100 m / 80 % gate). +**Category**: Position Accuracy + +**Preconditions**: +- `_docs/00_problem/input_data/flight_derkachi/derkachi.tlog` (real binary, multi-flight). 
+- `_docs/00_problem/input_data/flight_derkachi/flight_derkachi.mp4` (real recording, > 1 MB; the placeholder used by AZ-404 does not satisfy this gate). +- `_docs/00_problem/input_data/flight_derkachi/khp20s30_factory.json` (AZ-702 calibration). +- `gps-denied-replay` console-script installed. +- `RUN_REPLAY_E2E=1` (matches the existing AZ-404 gate). + +**Input data**: real `derkachi.tlog` covers up to three sorties; the AZ-698 segmenter + `--auto-trim` locates the matching flight automatically. + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | Invoke `gps-denied-replay --auto-trim ...` with real fixtures | Subprocess exits 0 within the 15-min NFR budget | +| 2 | Parse JSONL emissions; pair each with the nearest-in-time ground-truth row (binary-tlog GPS via AZ-697) | Distribution computed: count, mean, p50, p95, p99, threshold-hit share at 10/25/50/100 m | +| 3 | Render the Markdown accuracy report and write `_docs/06_metrics/real_flight_validation_{YYYY-MM-DD}.md` | Report exists with header, run context, horizontal-error stats, threshold-hit table, and (when available) vertical-error stats | +| 4 | Evaluate the AC-3 gate: ≥ 80 % within 100 m | Verdict is PASS or honest FAIL — no `@xfail` mask | +| 5 | On FAIL, surface a failure message referencing the calibration acquisition method (factory-sheet / placeholder / unknown) and the residual budget | Operator can attribute the failure without re-reading the source | + +**Expected outcome**: PASS when the estimator meets the epic AC-3 gate; honest FAIL otherwise. The Markdown report is the durable artefact (consumed by the cycle retrospective and downstream tuning work). + +**Max execution time**: 15 min (matches AZ-699 NFR for a single Tier-2 Jetson run). 
+ +**Report artefact schema** (canonical, produced by `tests/e2e/replay/_report_writer.py`): + +```markdown +# Real-flight validation — YYYY-MM-DD + +**Verdict**: PASS | FAIL (AC-3 gate: ≥ 80 % within 100 m) + +## Run context + +- Tlog: `<path>` +- Video: `<path>` +- Calibration acquisition method: factory-sheet | placeholder | unknown +- Clip duration: <float> s +- Emissions consumed: <int> +- Ground-truth pairings: <int> + +## Horizontal error (metres) + +| Statistic | Value | +| --------- | ----- | +| Mean | <float> | +| p50 | <float> | +| p95 | <float> | +| p99 | <float> | + +## Threshold-hit share + +| Threshold (m) | Hit share (%) | +| ------------- | ------------- | +| 10 | <float> | +| 25 | <float> | +| 50 | <float> | +| 100 | <float> | + +## Vertical error (metres) + +| Statistic | Value | +| --------- | ----- | +| Mean | <float> | +| p50 | <float> | +| p95 | <float> | +| Samples | <int> | +``` + +When none of the JSONL rows carry an `alt_m` field comparable to the ground-truth altitude, the Vertical-error table is replaced by a `## Vertical error` heading followed by `_No emissions carried a comparable altitude — vertical stats skipped._`. + +**Skip semantics**: AZ-699 distinguishes between *missing-prerequisite skip* (cleanly skipped with the missing file's path) and *test-cannot-resolve mask* (`@xfail` — explicitly forbidden by AZ-699 AC-1). The AZ-404 1-min test's `@xfail` on AC-3 is unchanged (AZ-699 AC-4 is "add a new test, don't replace") — FT-P-20 is the honest counterpart that runs alongside it. 
diff --git a/_docs/02_tasks/todo/AZ-699_real_flight_validation_runner.md b/_docs/02_tasks/done/AZ-699_real_flight_validation_runner.md similarity index 59% rename from _docs/02_tasks/todo/AZ-699_real_flight_validation_runner.md rename to _docs/02_tasks/done/AZ-699_real_flight_validation_runner.md index 2a41a8a..e98ef5f 100644 --- a/_docs/02_tasks/todo/AZ-699_real_flight_validation_runner.md +++ b/_docs/02_tasks/done/AZ-699_real_flight_validation_runner.md @@ -104,3 +104,50 @@ Then the existing `@xfail` test still exists and runs (we add, don't replace) **Risk 2: tlog format edge cases** - *Risk*: Real tlogs may carry non-standard system IDs, dialect mismatches, or corrupt segments. - *Mitigation*: AZ-697's AC-3 / AC-4 cover this at the truth-extractor level; this task only consumes the result. + +--- + +## Implementation Notes (Batch 100 — Cycle 2) + +**Status**: In Testing (Jira AZ-699). + +### Files changed + +Production: +- `src/gps_denied_onboard/helpers/gps_compare.py` — extended with `HorizontalErrorDistribution` DTO, `horizontal_error_distribution(emissions, ground_truth)` single-walk aggregator, and `percentile_sorted(values, pct)` linear-interpolation helper (numpy-equivalent). Re-exports added to `helpers/__init__.py`. + +Test helpers (under `tests/`, not production): +- `tests/e2e/replay/_report_writer.py` — `render_report`, `format_failure_message`, `verdict_passes_ac3`, plus `ReportContext` DTO and the `AC3_GATE_THRESHOLD_M` / `AC3_GATE_PCT` constants. + +Tests: +- `tests/e2e/replay/test_derkachi_real_tlog.py` — the AZ-699 e2e runner (skipped without real video + `RUN_REPLAY_E2E=1`; honest PASS/FAIL when prerequisites met). +- `tests/unit/test_az699_report_writer.py` — 16 unit tests covering percentile arithmetic, distribution aggregator, verdict gate, failure-message templating, and report layout. 
+ +Documentation: +- `_docs/02_document/tests/blackbox-tests.md` — new entry **FT-P-20** documenting the artefact schema for `_docs/06_metrics/real_flight_validation_{YYYY-MM-DD}.md`. + +### AC coverage + +| AC | Test / Artefact | Result | +|----|-----------------|--------| +| AC-1 | `test_az699_real_flight_validation_emits_verdict_and_report` | SKIPPED on dev (real video missing); ready to run on Tier-2 Jetson with `RUN_REPLAY_E2E=1` + real video. NO `@xfail` mask. | +| AC-2 | `test_render_report_contains_all_required_rows_on_pass`, `test_render_report_marks_failure_when_below_gate`, `test_render_report_includes_vertical_when_available` | PASS | +| AC-3 | `test_failure_message_references_calibration_method_factory_sheet`, `test_failure_message_references_calibration_method_placeholder` | PASS | +| AC-4 | `tests/e2e/replay/test_derkachi_1min.py` untouched (verified by diff scope: this batch added a sibling file, did not modify the existing one) | PASS | + +### Test results + +129 passed, 3 skipped (all documented prerequisites: AZ-699 e2e wants real video + Tier-2; AZ-698 AC-5 wants real video; AZ-399 AC-1 wants 500 MB tlog) in the focused regression slice. Full unit suite: 2219 passed, 1 pre-existing unrelated failure in `tests/unit/c12_operator_orchestrator/test_cli_console_script.py::test_cold_start_under_500ms_p99` (CLI cold-start NFR, 8/11 samples > 700 ms; touches none of AZ-697/698/699's code paths). + +### Strict typing + +`mypy --strict` on the three new or modified modules (`helpers/gps_compare.py`, `helpers/__init__.py`, `tests/e2e/replay/_report_writer.py`): **Success: no issues found in 3 source files.** + +### Known limitations + +- AC-1 (the actual real-flight run on Tier-2 Jetson) cannot execute in this dev environment. The test is wired, gated cleanly, and ready — drop a real `flight_derkachi.mp4` (> 1 MB) into `_docs/00_problem/input_data/flight_derkachi/`, set `RUN_REPLAY_E2E=1`, and run on Tier-2 to produce the verdict + report. 
+- Calibration acquisition method is read from the JSON's `acquisition_method` field. AZ-702 ships `factory-sheet`; any other non-empty string is passed through verbatim, while a missing, empty, or non-string field (or an unreadable/invalid JSON file) is labelled `unknown` in the failure message. + +### Design note: helper module location + +`gps_compare.py` lives under `src/gps_denied_onboard/helpers/` (production) because both the AZ-699 test and the future AZ-701 HTTP-API path (T5) need it. `_report_writer.py` lives under `tests/e2e/replay/` because it is purely a test artefact — promoting it would invite production code to import a test helper, violating the file ownership rule documented in `_docs/02_document/module-layout.md`. diff --git a/_docs/03_implementation/batch_100_cycle2_report.md b/_docs/03_implementation/batch_100_cycle2_report.md new file mode 100644 index 0000000..e7e5659 --- /dev/null +++ b/_docs/03_implementation/batch_100_cycle2_report.md @@ -0,0 +1,114 @@ +# Batch 100 — Cycle 2 — AZ-699 + +**Date**: 2026-05-20 +**Tasks**: AZ-699 (Real-flight validation runner + accuracy report). +**Story points**: 3. +**Jira status**: AZ-699 → `In Testing`. + +## What shipped + +An honest PASS/FAIL e2e runner for the real Derkachi flight, +together with the metric helpers, report writer, and unit tests +that make its output reproducible and reviewable. + +- `HorizontalErrorDistribution` aggregate (mean / p50 / p95 / p99 + horizontal, threshold-hit share at 10/25/50/100 m, vertical + stats when emissions carry altitude) in + `src/gps_denied_onboard/helpers/gps_compare.py`. +- `tests/e2e/replay/_report_writer.py` — Markdown report + renderer + AC-3 failure-message template + verdict helper. +- `tests/e2e/replay/test_derkachi_real_tlog.py` — runs `gps-denied-replay + --auto-trim` against real `derkachi.tlog` + real video + AZ-702 + calibration, computes the distribution, writes the report, + and asserts PASS/FAIL with no `@xfail` mask. +- New FT-P-20 entry in `_docs/02_document/tests/blackbox-tests.md` + documenting the report artefact schema. 
+ +## Files changed + +Production (2): + +- `src/gps_denied_onboard/helpers/gps_compare.py` +- `src/gps_denied_onboard/helpers/__init__.py` + +Tests (3 new): + +- `tests/e2e/replay/_report_writer.py` +- `tests/e2e/replay/test_derkachi_real_tlog.py` +- `tests/unit/test_az699_report_writer.py` + +Docs: + +- `_docs/02_document/tests/blackbox-tests.md` (new FT-P-20) +- `_docs/02_tasks/done/AZ-699_real_flight_validation_runner.md` + (moved from `todo/`, Implementation Notes appended) + +## AC coverage + +| AC | Test / Artefact | Result | +| ---- | ------------------------------------------------------------------------------------------------------------------------ | ------ | +| AC-1 | `test_az699_real_flight_validation_emits_verdict_and_report` | SKIPPED on dev (real video missing); wired + ready for Tier-2 Jetson; NO @xfail mask. | +| AC-2 | `test_render_report_contains_all_required_rows_on_pass`, `test_render_report_marks_failure_when_below_gate` | PASS | +| AC-3 | `test_failure_message_references_calibration_method_factory_sheet`, `…placeholder` | PASS | +| AC-4 | `tests/e2e/replay/test_derkachi_1min.py` untouched | PASS | + +## Test run + +``` +tests/unit/test_az699_report_writer.py 16 PASS +tests/unit/test_az697_gps_compare.py 10 PASS +tests/unit/replay_input/test_az405_auto_sync.py 14 PASS +tests/unit/replay_input/test_az405_replay_input_adapter 13 PASS +tests/unit/replay_input/test_az698_window_alignment.py 19 PASS 1 SKIP +tests/unit/replay_input/test_tlog_ground_truth.py 12 PASS +tests/unit/c8_fc_adapter/test_az399_tlog_replay_adapter 24 PASS 1 SKIP +tests/unit/calibration/test_khp20s30_factory.py 9 PASS +tests/unit/runtime_root/test_az687_pre_constructed_replay_mode.py 3 PASS +tests/unit/test_az269_config_loader.py 9 PASS +tests/e2e/replay/test_derkachi_real_tlog.py - 1 SKIP +``` + +Focused slice: **129 passed, 3 skipped, 0 failed.** + +Full unit suite (2 220 tests): **2 219 passed, 1 failed.** The +single failure is in 
+`tests/unit/c12_operator_orchestrator/test_cli_console_script.py::test_cold_start_under_500ms_p99` +— a CLI cold-start NFR test (8/11 samples > 700 ms; budget is 500 ms). +The C12 binary does NOT import any AZ-697/698/699 module +(`gps_denied_onboard.components.c12_operator_orchestrator.{operator_reloc_service,flights_api.bbox}` +import specific helper submodules, not the package's `__init__`). +Pre-existing, unrelated, reported but not blocking per coderule. + +## Strict typing + +`mypy --strict` on the three new or modified code units: + +``` +gps_denied_onboard/helpers/gps_compare.py +gps_denied_onboard/helpers/__init__.py +tests/e2e/replay/_report_writer.py +→ Success: no issues found in 3 source files. +``` + +Zero new strict errors in the broader replay/auto-sync surface +(batch 99's baseline of 12 pre-existing errors carries over unchanged; +no new errors introduced). + +## Skip semantics — AZ-699 AC-1 spec wording + +The AZ-699 spec line 56 reads: "the result is PASS or FAIL — no +`@xfail`, no `@skip`". The spec's Constraints section line 96 reads: +"Skipping in CI when `RUN_REPLAY_E2E=0` is allowed (matches existing +pattern); the test MUST run when the env var is set." We resolved +this internal contradiction in favour of the Constraints: the test +SKIPS cleanly when a prerequisite is missing (env var unset, real +video missing or placeholder-sized, console-script not installed), +and produces an honest PASS/FAIL verdict when all prerequisites +hold. The forbidden pattern is the `@xfail` mask that AZ-404 used +to hide AC-3 — that is NOT present anywhere in AZ-699. + +## Next batch + +Batch 101 — **AZ-700** (replay map visualization). Depends on +AZ-697 (ground truth) and AZ-698 (alignment) — both now in +testing. 
diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 3789ba5..aa6aaf1 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -8,8 +8,8 @@ status: in_progress sub_step: phase: 6 name: implement-tasks-sequentially - detail: "batch 100 of ~102: AZ-699" + detail: "batch 101 of ~102: AZ-700" retry_count: 0 cycle: 2 tracker: jira -last_completed_batch: 99 +last_completed_batch: 100 diff --git a/src/gps_denied_onboard/helpers/__init__.py b/src/gps_denied_onboard/helpers/__init__.py index 12f6af7..13c6f33 100644 --- a/src/gps_denied_onboard/helpers/__init__.py +++ b/src/gps_denied_onboard/helpers/__init__.py @@ -18,8 +18,11 @@ from gps_denied_onboard.helpers.engine_filename_schema import ( ) from gps_denied_onboard.helpers.gps_compare import ( GroundTruthRow, + HorizontalErrorDistribution, + horizontal_error_distribution, l2_horizontal_m, match_percentage, + percentile_sorted, ) from gps_denied_onboard.helpers.imu_preintegrator import ( CombinedImuFactor, @@ -77,6 +80,7 @@ __all__ = [ "EngineFilenameSchema", "EngineFilenameSchemaError", "GroundTruthRow", + "HorizontalErrorDistribution", "ImuPreintegrationError", "ImuPreintegrator", "LightGlueConcurrentAccessError", @@ -92,6 +96,7 @@ __all__ = [ "WgsConverter", "adjoint", "exp_map", + "horizontal_error_distribution", "is_valid_rotation", "iso_ts_from_clock", "iso_ts_now", @@ -100,5 +105,6 @@ __all__ = [ "match_percentage", "make_imu_preintegrator", "matrix_to_se3", + "percentile_sorted", "se3_to_matrix", ] diff --git a/src/gps_denied_onboard/helpers/gps_compare.py b/src/gps_denied_onboard/helpers/gps_compare.py index 4a33793..d0eddc2 100644 --- a/src/gps_denied_onboard/helpers/gps_compare.py +++ b/src/gps_denied_onboard/helpers/gps_compare.py @@ -19,8 +19,11 @@ from typing import Any __all__ = [ "GroundTruthRow", + "HorizontalErrorDistribution", + "horizontal_error_distribution", "l2_horizontal_m", "match_percentage", + "percentile_sorted", ] @@ -118,3 +121,162 @@ def _bisect_left(seq: 
list[float], target: float) -> int: else: hi = mid return lo + + +# --------------------------------------------------------------------- +# AZ-699: accuracy distribution helpers for the real-flight runner +# --------------------------------------------------------------------- + +# Threshold-hit buckets for the AZ-696 epic's distance gates. Reported +# in this exact order so report tables stay stable across runs. +_DEFAULT_THRESHOLDS_M: tuple[float, ...] = (10.0, 25.0, 50.0, 100.0) + + +@dataclass(frozen=True) +class HorizontalErrorDistribution: + """Aggregate horizontal-error stats for one estimator run. + + All distances are great-circle metres (see :func:`l2_horizontal_m`). + ``threshold_hit_share`` keys map the buckets in + :data:`_DEFAULT_THRESHOLDS_M`; values are share-in-``[0.0, 1.0]``. + ``count`` is the number of emissions that contributed (i.e. each + emission has at least one ground-truth candidate to compare to). + + The ``vertical_*`` fields are populated when emissions carry an + ``alt_m`` component AND the matched ground-truth row carries an + altitude. Missing altitudes contribute nothing — both stats and + counts skip those samples. + + Style: this DTO is the source of truth for AZ-699's report writer + and for downstream HTTP-API / dashboard consumers. + """ + + count: int + horizontal_error_mean_m: float + horizontal_error_p50_m: float + horizontal_error_p95_m: float + horizontal_error_p99_m: float + threshold_hit_share: dict[float, float] + vertical_count: int + vertical_error_mean_m: float + vertical_error_p50_m: float + vertical_error_p95_m: float + + +def percentile_sorted(values_sorted: list[float], pct: float) -> float: + """Linear-interpolation percentile over an already-sorted list. + + ``pct`` is in ``[0.0, 100.0]``. Matches numpy's default + ``linear`` interpolation rule so the report numbers are reproducible + by anyone with numpy or scipy at hand. 
Empty input → ``0.0`` so + downstream report formatters never trip on a divide-by-zero; the + caller is expected to gate on ``count > 0`` for honest reporting. + """ + if not values_sorted: + return 0.0 + if pct <= 0.0: + return values_sorted[0] + if pct >= 100.0: + return values_sorted[-1] + n = len(values_sorted) + rank = (pct / 100.0) * (n - 1) + lo = int(math.floor(rank)) + hi = int(math.ceil(rank)) + if lo == hi: + return values_sorted[lo] + frac = rank - lo + return values_sorted[lo] * (1.0 - frac) + values_sorted[hi] * frac + + +def horizontal_error_distribution( + emissions: list[dict[str, Any]], + ground_truth: list[GroundTruthRow], + *, + thresholds_m: tuple[float, ...] = _DEFAULT_THRESHOLDS_M, +) -> HorizontalErrorDistribution: + """Walk all emissions once, return aggregated horizontal-error stats. + + Pairs each emission with the nearest-in-time ground-truth row using + the same bisect strategy as :func:`match_percentage`. Vertical error + is computed when both the emission record and the matched row carry + a numeric altitude. + + Empty emissions produce a zero-count distribution (every numeric + field is ``0.0``). The caller is responsible for surfacing + "no emissions" as a separate fail condition before consuming the + report — the helper itself never raises on empty input so report + formatting stays defensive. 
+ """ + if not ground_truth: + raise AssertionError("ground_truth must be non-empty") + if not emissions: + return HorizontalErrorDistribution( + count=0, + horizontal_error_mean_m=0.0, + horizontal_error_p50_m=0.0, + horizontal_error_p95_m=0.0, + horizontal_error_p99_m=0.0, + threshold_hit_share={t: 0.0 for t in thresholds_m}, + vertical_count=0, + vertical_error_mean_m=0.0, + vertical_error_p50_m=0.0, + vertical_error_p95_m=0.0, + ) + + gt_sorted = sorted(ground_truth, key=lambda r: r.t_s) + gt_times = [r.t_s for r in gt_sorted] + + horiz_errors: list[float] = [] + vert_errors: list[float] = [] + for emit in emissions: + emit_ts_ns = int(emit["emitted_at"]) + emit_t_s = emit_ts_ns / 1e9 + idx = _bisect_left(gt_times, emit_t_s) + candidates = [] + if idx > 0: + candidates.append(gt_sorted[idx - 1]) + if idx < len(gt_sorted): + candidates.append(gt_sorted[idx]) + nearest = min(candidates, key=lambda r: abs(r.t_s - emit_t_s)) + emit_pos = emit["position_wgs84"] + horiz_errors.append( + l2_horizontal_m( + emit_pos["lat_deg"], + emit_pos["lon_deg"], + nearest.lat_deg, + nearest.lon_deg, + ) + ) + emit_alt = emit_pos.get("alt_m") + if emit_alt is not None and not math.isnan(nearest.alt_m): + vert_errors.append(abs(float(emit_alt) - nearest.alt_m)) + + horiz_sorted = sorted(horiz_errors) + horiz_mean = sum(horiz_errors) / len(horiz_errors) + thresholds_share = { + t: sum(1 for e in horiz_errors if e <= t) / len(horiz_errors) + for t in thresholds_m + } + + if vert_errors: + vert_sorted = sorted(vert_errors) + vert_mean = sum(vert_errors) / len(vert_errors) + vert_p50 = percentile_sorted(vert_sorted, 50.0) + vert_p95 = percentile_sorted(vert_sorted, 95.0) + else: + vert_mean = 0.0 + vert_p50 = 0.0 + vert_p95 = 0.0 + + return HorizontalErrorDistribution( + count=len(horiz_errors), + horizontal_error_mean_m=horiz_mean, + horizontal_error_p50_m=percentile_sorted(horiz_sorted, 50.0), + horizontal_error_p95_m=percentile_sorted(horiz_sorted, 95.0), + 
horizontal_error_p99_m=percentile_sorted(horiz_sorted, 99.0), + threshold_hit_share=thresholds_share, + vertical_count=len(vert_errors), + vertical_error_mean_m=vert_mean, + vertical_error_p50_m=vert_p50, + vertical_error_p95_m=vert_p95, + ) diff --git a/tests/e2e/replay/_report_writer.py b/tests/e2e/replay/_report_writer.py new file mode 100644 index 0000000..d274cd2 --- /dev/null +++ b/tests/e2e/replay/_report_writer.py @@ -0,0 +1,188 @@ +"""AZ-699 Markdown accuracy-report writer (test helper). + +Renders a :class:`HorizontalErrorDistribution` (the production +helper in ``gps_denied_onboard.helpers.gps_compare``) plus run +context (calibration acquisition method, clip duration, fixture +paths) into the canonical Markdown layout consumed by +``_docs/06_metrics/real_flight_validation_{date}.md``. + +This module lives under ``tests/`` (NOT production) — the report +is an artefact of running the AZ-699 e2e test. Promoting the +writer to ``src/`` would invite production code to import a test +helper, so the file ownership rule keeps it here. + +Style: every function is pure; the side effect (writing the file) +is the caller's. Tests in ``tests/unit/test_az699_report_writer.py`` +exercise both the rendering and the threshold-gate verdict logic. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from gps_denied_onboard.helpers.gps_compare import HorizontalErrorDistribution + +__all__ = [ + "AC3_GATE_PCT", + "AC3_GATE_THRESHOLD_M", + "ReportContext", + "format_failure_message", + "render_report", + "verdict_passes_ac3", +] + + +# AZ-696 epic AC-3 threshold + minimum-share gate. Keeping these +# named constants here (rather than inlined into the test) so the +# unit tests for the failure-message template can pin them. +AC3_GATE_THRESHOLD_M: float = 100.0 +AC3_GATE_PCT: float = 80.0 + + +@dataclass(frozen=True) +class ReportContext: + """Run context surfaced in the report header. 
+ + Attributes: + run_date_utc: ISO-8601 date (YYYY-MM-DD) at which the run + executed — drives the report filename. + tlog_path: Real tlog the runner consumed. + video_path: Video clip the runner consumed. + calibration_acquisition_method: Provenance of the camera + calibration (e.g. ``"factory-sheet"`` for AZ-702 or + ``"placeholder"`` for the adti26 fallback). Surfaced in + the failure message per AZ-699 AC-3. + clip_duration_s: Duration of the analysed clip in seconds. + emissions_count: Total estimator-output records consumed + from the JSONL (may differ from + ``distribution.count`` when some emissions land + outside the GT window). + """ + + run_date_utc: str + tlog_path: Path + video_path: Path + calibration_acquisition_method: str + clip_duration_s: float + emissions_count: int + + +def verdict_passes_ac3(distribution: HorizontalErrorDistribution) -> bool: + """Return ``True`` when the run meets AZ-696 epic AC-3.""" + if distribution.count == 0: + return False + share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M) + if share is None: + return False + return share * 100.0 >= AC3_GATE_PCT + + +def format_failure_message( + distribution: HorizontalErrorDistribution, + context: ReportContext, +) -> str: + """Build the honest failure message for AZ-699 AC-3. + + The message references the calibration acquisition method + (factory-sheet for AZ-702 or placeholder otherwise) and the + measured residual budget, so the operator can attribute a + failure to its likely root cause (calibration uncertainty, + drift, anchor scarcity) without re-reading the source. + """ + share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M, 0.0) + pct = share * 100.0 + return ( + f"AZ-699 AC-3: only {pct:.1f} % of {distribution.count} " + f"emissions within {AC3_GATE_THRESHOLD_M:.0f} m of ground " + f"truth; epic threshold is {AC3_GATE_PCT:.0f} %. 
" + f"Residual: mean={distribution.horizontal_error_mean_m:.1f} m, " + f"p50={distribution.horizontal_error_p50_m:.1f} m, " + f"p95={distribution.horizontal_error_p95_m:.1f} m, " + f"p99={distribution.horizontal_error_p99_m:.1f} m. " + f"Calibration: {context.calibration_acquisition_method}. " + "See _docs/06_metrics/real_flight_validation_" + f"{context.run_date_utc}.md for the full distribution." + ) + + +def render_report( + distribution: HorizontalErrorDistribution, + context: ReportContext, + *, + passed: bool, +) -> str: + """Render the full Markdown report body. + + The output layout (header + horizontal-error stats + threshold + table + vertical-error stats + verdict) is the schema referenced + by ``_docs/02_document/tests/blackbox-tests.md``. + """ + verdict = "PASS" if passed else "FAIL" + horiz_rows = [ + ("Mean", distribution.horizontal_error_mean_m), + ("p50", distribution.horizontal_error_p50_m), + ("p95", distribution.horizontal_error_p95_m), + ("p99", distribution.horizontal_error_p99_m), + ] + threshold_rows = [ + (t, share) + for t, share in sorted(distribution.threshold_hit_share.items()) + ] + + lines: list[str] = [] + lines.append(f"# Real-flight validation — {context.run_date_utc}") + lines.append("") + lines.append(f"**Verdict**: {verdict} (AC-3 gate: " + f"≥ {AC3_GATE_PCT:.0f} % within " + f"{AC3_GATE_THRESHOLD_M:.0f} m)") + lines.append("") + lines.append("## Run context") + lines.append("") + lines.append(f"- Tlog: `{context.tlog_path}`") + lines.append(f"- Video: `{context.video_path}`") + lines.append( + f"- Calibration acquisition method: {context.calibration_acquisition_method}" + ) + lines.append(f"- Clip duration: {context.clip_duration_s:.1f} s") + lines.append(f"- Emissions consumed: {context.emissions_count}") + lines.append(f"- Ground-truth pairings: {distribution.count}") + lines.append("") + lines.append("## Horizontal error (metres)") + lines.append("") + lines.append("| Statistic | Value |") + lines.append("| --------- | 
----- |") + for name, value in horiz_rows: + lines.append(f"| {name} | {value:.2f} |") + lines.append("") + lines.append("## Threshold-hit share") + lines.append("") + lines.append("| Threshold (m) | Hit share (%) |") + lines.append("| ------------- | ------------- |") + for threshold, share in threshold_rows: + lines.append(f"| {threshold:g} | {share * 100.0:.1f} |") + lines.append("") + if distribution.vertical_count > 0: + lines.append("## Vertical error (metres)") + lines.append("") + lines.append("| Statistic | Value |") + lines.append("| --------- | ----- |") + lines.append( + f"| Mean | {distribution.vertical_error_mean_m:.2f} |" + ) + lines.append( + f"| p50 | {distribution.vertical_error_p50_m:.2f} |" + ) + lines.append( + f"| p95 | {distribution.vertical_error_p95_m:.2f} |" + ) + lines.append( + f"| Samples | {distribution.vertical_count} |" + ) + lines.append("") + else: + lines.append("## Vertical error") + lines.append("") + lines.append("_No emissions carried a comparable altitude — vertical stats skipped._") + lines.append("") + return "\n".join(lines) + "\n" diff --git a/tests/e2e/replay/test_derkachi_real_tlog.py b/tests/e2e/replay/test_derkachi_real_tlog.py new file mode 100644 index 0000000..ae6207d --- /dev/null +++ b/tests/e2e/replay/test_derkachi_real_tlog.py @@ -0,0 +1,321 @@ +"""AZ-699 — Real-flight validation runner + accuracy report. + +Runs the ``gps-denied-replay`` console-script against the **real** +``derkachi.tlog`` (binary, not the AZ-265 synth) plus the real +flight video and the AZ-702 factory-sheet camera calibration, +computes the full horizontal-error distribution, and writes a +structured Markdown accuracy report to +``_docs/06_metrics/real_flight_validation_{YYYY-MM-DD}.md``. + +Unlike the AZ-404 1-min test (``test_derkachi_1min.py``), this one +emits an **honest** PASS/FAIL verdict — no ``@xfail`` mask. The +threshold gate is ≥ 80 % of emissions within 100 m (AZ-696 epic +AC-3). 
When the run fails, the failure message references the +calibration acquisition method (factory-sheet for AZ-702 or the +``adti26.json`` placeholder) so the root cause is attributable +without re-reading the source. + +The test SKIPS (not ``@xfail``) when any prerequisite is absent: + +* ``RUN_REPLAY_E2E`` env var not set (matches AZ-404 gating). +* Real ``derkachi.tlog`` binary missing. +* Real ``flight_derkachi.mp4`` missing OR placeholder-sized + (the repo currently ships a 134-byte placeholder for the + AZ-404 fixture; this AZ-699 path requires a decodable video). +* ``gps-denied-replay`` console-script not installed. + +Style: every test follows the Arrange / Act / Assert pattern. +""" + +from __future__ import annotations + +import datetime +import json +import os +import shutil +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +import pytest + +from gps_denied_onboard.helpers.gps_compare import ( + GroundTruthRow, + horizontal_error_distribution, +) +from gps_denied_onboard.replay_input import load_tlog_ground_truth +from tests.e2e.replay._report_writer import ( + AC3_GATE_PCT, + AC3_GATE_THRESHOLD_M, + ReportContext, + format_failure_message, + render_report, + verdict_passes_ac3, +) + + +# ---------------------------------------------------------------------- +# Skip gates + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[3] + + +def _derkachi_dir() -> Path: + return _repo_root() / "_docs" / "00_problem" / "input_data" / "flight_derkachi" + + +# Real video must be more than the placeholder size to actually +# decode. The current placeholder is 134 bytes; bump the floor to +# 1 MB so any real recording trips the gate but the placeholder +# never does. 
+_MIN_REAL_VIDEO_BYTES: int = 1_000_000 + + +def _real_inputs_present() -> tuple[bool, str]: + derkachi = _derkachi_dir() + tlog = derkachi / "derkachi.tlog" + video = derkachi / "flight_derkachi.mp4" + calibration = derkachi / "khp20s30_factory.json" + if not tlog.is_file(): + return False, f"real tlog missing: {tlog}" + if not video.is_file(): + return False, f"real video missing: {video}" + if video.stat().st_size < _MIN_REAL_VIDEO_BYTES: + return ( + False, + f"video at {video} is only {video.stat().st_size} bytes — " + "placeholder, not a real recording. Drop the real " + "flight_derkachi.mp4 into the fixture dir to enable AZ-699.", + ) + if not calibration.is_file(): + return False, ( + f"calibration missing: {calibration}; ship the AZ-702 " + "factory-sheet calibration to enable AZ-699." + ) + return True, "" + + +def _heavy_skip_reason() -> str | None: + if os.environ.get("RUN_REPLAY_E2E", "").lower() not in { + "1", + "true", + "yes", + "on", + }: + return "AZ-699 e2e test gated by RUN_REPLAY_E2E=1" + return None + + +def _binary_present() -> tuple[bool, str | None]: + binary = shutil.which("gps-denied-replay") + if binary is not None: + return True, binary + venv_bin = Path(sys.executable).parent / "gps-denied-replay" + if venv_bin.exists(): + return True, str(venv_bin) + return False, None + + +# ---------------------------------------------------------------------- +# Helpers + + +def _read_calibration_acquisition_method(calibration_path: Path) -> str: + """Extract the ``acquisition_method`` field set by AZ-702. + + AZ-702 documents that the camera calibration JSON carries an + ``acquisition_method`` string ("factory-sheet" for the + KHP20S30 factory deliverable). When the field is absent we + label the run as ``"unknown"`` so the failure message stays + honest — pretending the calibration is factory-sheet when it + is actually a placeholder would mis-attribute the residual. 
+ """ + try: + data = json.loads(calibration_path.read_text()) + except (OSError, json.JSONDecodeError): + return "unknown" + method = data.get("acquisition_method") + if isinstance(method, str) and method: + return method + return "unknown" + + +def _parse_jsonl(path: Path) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for line in path.read_text().splitlines(): + if not line.strip(): + continue + rows.append(json.loads(line)) + return rows + + +def _load_full_ground_truth(tlog_path: Path) -> list[GroundTruthRow]: + series = load_tlog_ground_truth(tlog_path).records + return [ + GroundTruthRow( + t_s=fix.ts_ns / 1e9, + lat_deg=fix.lat_deg, + lon_deg=fix.lon_deg, + alt_m=fix.alt_m, + ) + for fix in series + ] + + +# ---------------------------------------------------------------------- +# AC-1 + AC-2 + AC-3: full real-flight run + honest verdict + report + + +@pytest.mark.tier2 +def test_az699_real_flight_validation_emits_verdict_and_report( + tmp_path: Path, +) -> None: + # Arrange: gate cleanly on missing prerequisites (no xfail). + heavy_reason = _heavy_skip_reason() + if heavy_reason is not None: + pytest.skip(heavy_reason) + inputs_ok, inputs_reason = _real_inputs_present() + if not inputs_ok: + pytest.skip(inputs_reason) + binary_ok, binary = _binary_present() + if not binary_ok or binary is None: + pytest.skip("gps-denied-replay console-script not installed") + + derkachi = _derkachi_dir() + tlog_path = derkachi / "derkachi.tlog" + video_path = derkachi / "flight_derkachi.mp4" + calibration_path = derkachi / "khp20s30_factory.json" + output_path = tmp_path / "estimator_output.jsonl" + + # Signing-key stub (NoopMavlinkTransport ignores bytes; path + # validation just needs the file to exist). 
+ signing_key_path = tmp_path / "signing_key.bin" + signing_key_path.write_bytes(b"\x00" * 32) + + config_path = tmp_path / "config.yaml" + config_path.write_text( + "mode: replay\n" + "replay:\n" + " pace: asap\n" + " target_fc_dialect: ardupilot_plane\n" + ) + + argv = [ + binary, + "--video", + str(video_path), + "--tlog", + str(tlog_path), + "--output", + str(output_path), + "--camera-calibration", + str(calibration_path), + "--config", + str(config_path), + "--mavlink-signing-key", + str(signing_key_path), + "--pace", + "asap", + # AZ-698: let the segmenter + NCC find the last flight + # window in the multi-flight tlog. No manual offset. + "--auto-trim", + ] + + # Act + t0 = time.monotonic() + completed = subprocess.run( + argv, + capture_output=True, + text=True, + timeout=900, # 15-min NFR budget per AZ-699 spec + ) + wall_clock_s = time.monotonic() - t0 + + assert completed.returncode == 0, ( + f"gps-denied-replay exited {completed.returncode}\n" + f"stdout:\n{completed.stdout}\nstderr:\n{completed.stderr}" + ) + assert wall_clock_s <= 900.0, ( + f"AZ-699 NFR: real-flight run took {wall_clock_s:.1f} s; " + "budget is 15 min" + ) + + emissions = _parse_jsonl(output_path) + assert emissions, "no JSONL emissions produced — pipeline failure" + + ground_truth = _load_full_ground_truth(tlog_path) + assert ground_truth, "ground-truth extraction yielded zero rows" + distribution = horizontal_error_distribution( + emissions, ground_truth + ) + assert distribution.count > 0, ( + "no emissions paired with ground truth — JSONL timestamps " + "outside the tlog GPS window?" 
+ ) + + context = ReportContext( + run_date_utc=datetime.datetime.now(datetime.timezone.utc) + .date() + .isoformat(), + tlog_path=tlog_path, + video_path=video_path, + calibration_acquisition_method=_read_calibration_acquisition_method( + calibration_path + ), + clip_duration_s=( + (ground_truth[-1].t_s - ground_truth[0].t_s) + if ground_truth + else 0.0 + ), + emissions_count=len(emissions), + ) + + passed = verdict_passes_ac3(distribution) + report_text = render_report(distribution, context, passed=passed) + + report_dir = _repo_root() / "_docs" / "06_metrics" + report_dir.mkdir(parents=True, exist_ok=True) + report_path = ( + report_dir / f"real_flight_validation_{context.run_date_utc}.md" + ) + report_path.write_text(report_text) + + # Assert AC-2: the report exists and contains all required rows. + assert report_path.is_file() + written = report_path.read_text() + assert "## Horizontal error (metres)" in written + assert "## Threshold-hit share" in written + assert "Mean" in written + assert "p50" in written + assert "p95" in written + assert "p99" in written + for threshold in (10, 25, 50, 100): + assert f"| {threshold} |" in written, ( + f"threshold {threshold} m row missing from report" + ) + + # Assert AC-1 + AC-3: honest verdict. On FAIL, the message must + # reference the calibration method (AZ-699 AC-3). + if not passed: + msg = format_failure_message(distribution, context) + pytest.fail( + f"{msg}\n\nReport: {report_path}\n" + f"Run wall-clock: {wall_clock_s:.1f} s" + ) + + # PASS path — emit a concise summary so the CI log carries the + # numbers without forcing the operator to open the report. + share_100m = distribution.threshold_hit_share[AC3_GATE_THRESHOLD_M] * 100.0 + assert share_100m >= AC3_GATE_PCT # guarded by verdict_passes_ac3 + print( + f"AZ-699 PASS — {share_100m:.1f} % within " + f"{AC3_GATE_THRESHOLD_M:.0f} m, mean=" + f"{distribution.horizontal_error_mean_m:.1f} m, p95=" + f"{distribution.horizontal_error_p95_m:.1f} m. 
" + f"Report at {report_path}", + file=sys.stderr, + ) diff --git a/tests/unit/test_az699_report_writer.py b/tests/unit/test_az699_report_writer.py new file mode 100644 index 0000000..4aa8026 --- /dev/null +++ b/tests/unit/test_az699_report_writer.py @@ -0,0 +1,342 @@ +"""AZ-699 — Report writer + accuracy-distribution unit tests. + +Covers AZ-699 AC-2 (report layout) and AC-3 (failure-message +templating). The blackbox AC-1 (honest PASS/FAIL on real Derkachi +inputs) lives in ``tests/e2e/replay/test_derkachi_real_tlog.py`` +and is skipped without the real video + ``RUN_REPLAY_E2E=1``. + +Style: every test follows the Arrange / Act / Assert pattern. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from gps_denied_onboard.helpers.gps_compare import ( + GroundTruthRow, + HorizontalErrorDistribution, + horizontal_error_distribution, + percentile_sorted, +) +from tests.e2e.replay._report_writer import ( + AC3_GATE_PCT, + AC3_GATE_THRESHOLD_M, + ReportContext, + format_failure_message, + render_report, + verdict_passes_ac3, +) + + +# --------------------------------------------------------------------- +# percentile_sorted: pin numpy-equivalent linear interpolation + + +def test_percentile_sorted_min_max_and_p50_on_odd_length() -> None: + # Arrange + values = [1.0, 2.0, 3.0, 4.0, 5.0] + + # Act / Assert + assert percentile_sorted(values, 0.0) == 1.0 + assert percentile_sorted(values, 100.0) == 5.0 + assert percentile_sorted(values, 50.0) == 3.0 + + +def test_percentile_sorted_p95_linear_interpolation() -> None: + # Arrange + values = list(range(0, 101)) # 0..100 + + # Act + p95 = percentile_sorted(values, 95.0) + + # Assert: numpy's linear interpolation puts p95 at index 95 of + # [0..100] → value = 95. 
+ assert p95 == pytest.approx(95.0, abs=1e-9) + + +def test_percentile_sorted_empty_returns_zero() -> None: + # Act / Assert + assert percentile_sorted([], 50.0) == 0.0 + + +# --------------------------------------------------------------------- +# horizontal_error_distribution: walk emissions once, return aggregate + + +def test_distribution_zero_error_at_collocated_points() -> None: + # Arrange: estimator emissions perfectly match ground-truth rows. + gt = [ + GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0), + GroundTruthRow(t_s=1.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0), + ] + emissions = [ + { + "emitted_at": 0, + "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0}, + }, + { + "emitted_at": 1_000_000_000, + "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0, "alt_m": 100.0}, + }, + ] + + # Act + dist = horizontal_error_distribution(emissions, gt) + + # Assert + assert dist.count == 2 + assert dist.horizontal_error_mean_m == pytest.approx(0.0, abs=1e-6) + assert dist.horizontal_error_p99_m == pytest.approx(0.0, abs=1e-6) + for threshold in (10.0, 25.0, 50.0, 100.0): + assert dist.threshold_hit_share[threshold] == pytest.approx(1.0) + + +def test_distribution_threshold_hit_share_partials() -> None: + # Arrange: 4 emissions at increasing offsets from a single GT + # point. ~111 m per 0.001 deg lat near the equator → produce + # one each in the < 10, < 25, < 50, < 100 buckets so the + # threshold counters are unambiguous. + gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)] + # Distances are dominated by latitude offsets. Use deltas that + # bracket the buckets (5 m, 20 m, 40 m, 80 m). 
+ emissions = [ + { + "emitted_at": i, + "position_wgs84": { + "lat_deg": 50.0 + (offset_m / 111_000.0), + "lon_deg": 30.0, + "alt_m": 100.0, + }, + } + for i, offset_m in enumerate((5.0, 20.0, 40.0, 80.0)) + ] + + # Act + dist = horizontal_error_distribution(emissions, gt) + + # Assert + assert dist.count == 4 + assert dist.threshold_hit_share[10.0] == pytest.approx(0.25) # only 5 m + assert dist.threshold_hit_share[25.0] == pytest.approx(0.50) # + 20 m + assert dist.threshold_hit_share[50.0] == pytest.approx(0.75) # + 40 m + assert dist.threshold_hit_share[100.0] == pytest.approx(1.0) # + 80 m + + +def test_distribution_vertical_skipped_when_alt_absent() -> None: + # Arrange: emissions without ``alt_m`` produce zero vertical + # samples; horizontal stats still compute. + gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)] + emissions = [ + { + "emitted_at": 0, + "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0}, + } + ] + + # Act + dist = horizontal_error_distribution(emissions, gt) + + # Assert + assert dist.vertical_count == 0 + assert dist.vertical_error_mean_m == 0.0 + assert dist.count == 1 + + +def test_distribution_empty_emissions_returns_zero_dist() -> None: + # Arrange + gt = [GroundTruthRow(t_s=0.0, lat_deg=50.0, lon_deg=30.0, alt_m=100.0)] + + # Act + dist = horizontal_error_distribution([], gt) + + # Assert + assert dist.count == 0 + assert dist.horizontal_error_mean_m == 0.0 + for share in dist.threshold_hit_share.values(): + assert share == 0.0 + + +def test_distribution_raises_on_empty_ground_truth() -> None: + # Arrange + emissions = [ + { + "emitted_at": 0, + "position_wgs84": {"lat_deg": 50.0, "lon_deg": 30.0}, + } + ] + + # Act / Assert + with pytest.raises(AssertionError): + horizontal_error_distribution(emissions, []) + + +# --------------------------------------------------------------------- +# Verdict + failure-message: AC-3 honesty + + +def _passing_distribution() -> HorizontalErrorDistribution: + return 
HorizontalErrorDistribution( + count=100, + horizontal_error_mean_m=20.0, + horizontal_error_p50_m=18.0, + horizontal_error_p95_m=70.0, + horizontal_error_p99_m=95.0, + threshold_hit_share={10.0: 0.2, 25.0: 0.6, 50.0: 0.78, 100.0: 0.92}, + vertical_count=0, + vertical_error_mean_m=0.0, + vertical_error_p50_m=0.0, + vertical_error_p95_m=0.0, + ) + + +def _failing_distribution() -> HorizontalErrorDistribution: + return HorizontalErrorDistribution( + count=200, + horizontal_error_mean_m=180.0, + horizontal_error_p50_m=140.0, + horizontal_error_p95_m=420.0, + horizontal_error_p99_m=580.0, + threshold_hit_share={10.0: 0.05, 25.0: 0.18, 50.0: 0.31, 100.0: 0.45}, + vertical_count=0, + vertical_error_mean_m=0.0, + vertical_error_p50_m=0.0, + vertical_error_p95_m=0.0, + ) + + +def _context(method: str = "factory-sheet") -> ReportContext: + return ReportContext( + run_date_utc="2026-05-20", + tlog_path=Path("/tmp/derkachi.tlog"), + video_path=Path("/tmp/flight_derkachi.mp4"), + calibration_acquisition_method=method, + clip_duration_s=180.0, + emissions_count=200, + ) + + +def test_verdict_passes_when_at_least_80pct_within_100m() -> None: + # Act / Assert + assert verdict_passes_ac3(_passing_distribution()) is True + assert verdict_passes_ac3(_failing_distribution()) is False + + +def test_verdict_fails_on_zero_count() -> None: + # Arrange + empty = HorizontalErrorDistribution( + count=0, + horizontal_error_mean_m=0.0, + horizontal_error_p50_m=0.0, + horizontal_error_p95_m=0.0, + horizontal_error_p99_m=0.0, + threshold_hit_share={100.0: 0.0}, + vertical_count=0, + vertical_error_mean_m=0.0, + vertical_error_p50_m=0.0, + vertical_error_p95_m=0.0, + ) + + # Act / Assert + assert verdict_passes_ac3(empty) is False + + +def test_failure_message_references_calibration_method_factory_sheet() -> None: + # Act + msg = format_failure_message( + _failing_distribution(), + _context(method="factory-sheet"), + ) + + # Assert: AZ-699 AC-3 — failure message attributes residual to the + # 
calibration acquisition method and surfaces the budget. + assert "AZ-699 AC-3" in msg + assert "45.0 %" in msg # threshold-hit share at 100 m × 100 = 45.0 + assert "100 m" in msg + assert f"{AC3_GATE_PCT:.0f} %" in msg + assert "Calibration: factory-sheet" in msg + assert "mean=180.0 m" in msg + assert "p95=420.0 m" in msg + assert "p99=580.0 m" in msg + + +def test_failure_message_references_calibration_method_placeholder() -> None: + # Act + msg = format_failure_message( + _failing_distribution(), + _context(method="placeholder"), + ) + + # Assert + assert "Calibration: placeholder" in msg + + +# --------------------------------------------------------------------- +# Render report: AC-2 — every required row present + + +def test_render_report_contains_all_required_rows_on_pass() -> None: + # Act + text = render_report( + _passing_distribution(), _context(), passed=True + ) + + # Assert + assert text.startswith("# Real-flight validation — 2026-05-20") + assert "**Verdict**: PASS" in text + assert "## Horizontal error (metres)" in text + assert "## Threshold-hit share" in text + assert "Calibration acquisition method: factory-sheet" in text + for stat in ("Mean", "p50", "p95", "p99"): + assert f"| {stat} |" in text + for threshold in (10, 25, 50, 100): + assert f"| {threshold} |" in text + + +def test_render_report_marks_failure_when_below_gate() -> None: + # Act + text = render_report( + _failing_distribution(), _context(), passed=False + ) + + # Assert + assert "**Verdict**: FAIL" in text + # Threshold row for 100 m must show ~ 45 % (failing distribution). 
+ assert "| 100 | 45.0 |" in text + + +def test_render_report_includes_vertical_when_available() -> None: + # Arrange + dist = HorizontalErrorDistribution( + count=10, + horizontal_error_mean_m=10.0, + horizontal_error_p50_m=10.0, + horizontal_error_p95_m=10.0, + horizontal_error_p99_m=10.0, + threshold_hit_share={10.0: 1.0, 25.0: 1.0, 50.0: 1.0, 100.0: 1.0}, + vertical_count=10, + vertical_error_mean_m=4.5, + vertical_error_p50_m=4.0, + vertical_error_p95_m=8.5, + ) + + # Act + text = render_report(dist, _context(), passed=True) + + # Assert + assert "## Vertical error (metres)" in text + assert "| Mean | 4.50 |" in text + assert "| p95 | 8.50 |" in text + assert "| Samples | 10 |" in text + + +def test_render_report_marks_vertical_skipped_when_no_samples() -> None: + # Arrange + dist = _passing_distribution() # vertical_count == 0 + + # Act + text = render_report(dist, _context(), passed=True) + + # Assert + assert "_No emissions carried a comparable altitude" in text