[AZ-432] [AZ-433] [AZ-434] [AZ-435] Add NFT-RES-01..04 resilience scenarios

Batch 86: 4 NFT-RES blackbox scenarios + 4 helper evaluators + 74 unit tests + directory-layout registration. * AZ-432 NFT-RES-01: 30 s IMU-only fallback drift bound (AC-3.5 + AC-NEW-7); two sub-cases (no_imu ≤100m, good_imu_combined_factor ≤50m). * AZ-433 NFT-RES-02: companion mid-flight reboot (AC-5.2 + AC-5.3); resume ≤30s + first-emission accuracy ≤100m. * AZ-434 NFT-RES-03: 100-iteration Monte Carlo envelope (AC-NEW-4); iteration-count + master-seed determinism + envelope ratio ≥0.95. Canonical-param by default; E2E_NFT_RES_03_FULL_MATRIX=1 unlocks matrix. * AZ-435 NFT-RES-04: 35s blackout+spoof escalation ladder (AC-NEW-8); AC-1 (cov-2d→fix-degrade ≤500ms) + AC-2 (failsafe→999+STATUSTEXT ≤500ms) + AC-ORDER (strict ordering). Verdict: PASS_WITH_WARNINGS (0 Critical, 0 High, 0 Medium, 5 Low). F5 documents intentional threshold duplication with blackout_spoof evaluator (prevents contract drift between FT-N-04 and NFT-RES-04). Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 12:01:13 +00:00 · 2026-05-17 17:09:04 +03:00
parent 23640a784f
commit 330893be5c
15 changed files with 3325 additions and 0 deletions
@@ -0,0 +1,278 @@
+"""Unit tests for ``runner.helpers.companion_reboot_evaluator`` (AZ-433 / NFT-RES-02)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import companion_reboot_evaluator as cre
+
+REF_LAT = 49.9
+REF_LON = 36.3
+
+
+def _offset_lat(lat: float, north_m: float) -> float:
+    """Return ``lat`` shifted north by ``north_m`` meters (flat approximation)."""
+    # 111_320 is used as the number of meters per degree of latitude.
+    return lat + north_m / 111_320.0
+
+
+def _evidence(
+    *,
+    restart_command_ms: int = 60_000,
+    process_restarted_ms: int | None = 62_000,
+    first_emission_ms: int | None = 80_000,
+    estimate_offset_m: float | None = 0.0,
+    gt_offset_m: float | None = 0.0,
+) -> cre.RestartEvidence:
+    """Build a ``RestartEvidence`` fixture for the evaluator tests.
+
+    Offsets are northward distances in meters from ``REF_LAT``; an offset of
+    ``None`` leaves the corresponding fix out of the evidence. Both fixes are
+    stamped with ``first_emission_ms`` (0 when that value is ``None``).
+    """
+    fix_stamp_ms = first_emission_ms or 0
+
+    def _fix_at(offset_m: float | None) -> cre.GeoFix | None:
+        # A missing offset models a fix that was never captured.
+        if offset_m is None:
+            return None
+        return cre.GeoFix(
+            monotonic_ms=fix_stamp_ms,
+            lat_deg=_offset_lat(REF_LAT, offset_m),
+            lon_deg=REF_LON,
+        )
+
+    return cre.RestartEvidence(
+        restart_command_monotonic_ms=restart_command_ms,
+        process_restarted_monotonic_ms=process_restarted_ms,
+        first_post_restart_emission_monotonic_ms=first_emission_ms,
+        first_post_restart_estimate=_fix_at(estimate_offset_m),
+        ground_truth_at_first_emission=_fix_at(gt_offset_m),
+    )
+
+
+# ───────────────────────── happy path ─────────────────────────
+
+
+def test_evaluate_clean_run_passes_all_acs() -> None:
+    """A nominal restart satisfies every acceptance criterion at once."""
+    # Arrange + Act — process back 2 s after the command, first emission
+    # after 20 s, estimate 50 m north of ground truth.
+    outcome = cre.evaluate(
+        _evidence(
+            process_restarted_ms=62_000,
+            first_emission_ms=80_000,
+            estimate_offset_m=50.0,
+            gt_offset_m=0.0,
+        )
+    )
+
+    # Assert — measured values first, then per-AC and aggregate verdicts.
+    assert outcome.restart_trigger_latency_s == pytest.approx(2.0)
+    assert outcome.resume_time_s == pytest.approx(20.0)
+    assert outcome.first_emission_accuracy_m == pytest.approx(50.0, abs=0.5)
+    assert outcome.passes_restart_trigger
+    assert outcome.passes_resume_time
+    assert outcome.passes_first_emission_accuracy
+    assert outcome.passes
+
+
+# ───────────────────────── AC-1: restart trigger ─────────────────────────
+
+
+def test_ac1_at_budget_passes() -> None:
+    """A restart latency of exactly 5 s still passes AC-1."""
+    # Arrange — process is back exactly 5 s after the restart command.
+    on_budget = _evidence(process_restarted_ms=60_000 + 5000)
+
+    # Act
+    outcome = cre.evaluate(on_budget)
+
+    # Assert
+    assert outcome.passes_restart_trigger
+
+
+def test_ac1_above_budget_fails() -> None:
+    """A 6 s restart latency fails AC-1 and the aggregate verdict."""
+    # Arrange + Act — process comes back 6 s after the restart command.
+    outcome = cre.evaluate(_evidence(process_restarted_ms=60_000 + 6000))
+
+    # Assert
+    assert outcome.restart_trigger_latency_s == pytest.approx(6.0)
+    assert not outcome.passes_restart_trigger
+    assert not outcome.passes
+
+
+def test_ac1_process_never_restarts_fails() -> None:
+    """Without a restart timestamp the latency is unknown and AC-1 fails."""
+    # Arrange + Act — the process never comes back after the command.
+    outcome = cre.evaluate(_evidence(process_restarted_ms=None))
+
+    # Assert
+    assert outcome.restart_trigger_latency_s is None
+    assert not outcome.passes_restart_trigger
+    assert not outcome.passes
+
+
+def test_ac1_negative_latency_raises() -> None:
+    """A restart timestamp earlier than the command is rejected with ValueError."""
+    # Arrange — process "restarted" 1 s before the command (clock-skew bug).
+    skewed = _evidence(process_restarted_ms=60_000 - 1000)
+
+    # Act + Assert
+    with pytest.raises(ValueError):
+        cre.evaluate(skewed)
+
+
+# ───────────────────────── AC-2: resume time ─────────────────────────
+
+
+def test_ac2_at_budget_passes() -> None:
+    """A resume time of exactly 30 s still passes AC-2."""
+    # Arrange — first emission exactly 30 s after the restart command.
+    on_budget = _evidence(first_emission_ms=60_000 + 30_000)
+
+    # Act
+    outcome = cre.evaluate(on_budget)
+
+    # Assert
+    assert outcome.passes_resume_time
+
+
+def test_ac2_above_budget_fails() -> None:
+    """A 31 s resume time fails AC-2."""
+    # Arrange + Act — first emission 31 s after the restart command.
+    outcome = cre.evaluate(_evidence(first_emission_ms=60_000 + 31_000))
+
+    # Assert
+    assert outcome.resume_time_s == pytest.approx(31.0)
+    assert not outcome.passes_resume_time
+
+
+def test_ac2_no_emission_fails() -> None:
+    """With no post-restart emission the resume time is unknown and AC-2 fails."""
+    # Arrange — nothing was emitted, so neither fix exists either.
+    silent = _evidence(
+        first_emission_ms=None,
+        estimate_offset_m=None,
+        gt_offset_m=None,
+    )
+
+    # Act
+    outcome = cre.evaluate(silent)
+
+    # Assert
+    assert outcome.resume_time_s is None
+    assert not outcome.passes_resume_time
+    assert not outcome.passes
+
+
+def test_ac2_negative_resume_raises() -> None:
+    """An emission stamped before the restart command is rejected with ValueError."""
+    # Arrange — first emission 1 s ahead of the 60 s restart command.
+    premature = _evidence(first_emission_ms=59_000)
+
+    # Act + Assert
+    with pytest.raises(ValueError):
+        cre.evaluate(premature)
+
+
+# ───────────────────────── AC-3: first-emission accuracy ─────────────────────────
+
+
+def test_ac3_within_budget_passes() -> None:
+    """An 80 m first-emission error is inside the 100 m budget."""
+    # Arrange + Act — estimate 80 m north of ground truth.
+    outcome = cre.evaluate(_evidence(estimate_offset_m=80.0, gt_offset_m=0.0))
+
+    # Assert
+    assert outcome.first_emission_accuracy_m == pytest.approx(80.0, abs=0.5)
+    assert outcome.passes_first_emission_accuracy
+
+
+def test_ac3_at_budget_passes() -> None:
+    """A 100 m offset, matching the budget, still passes AC-3."""
+    # Arrange + Act — estimate offset by exactly 100 m.
+    outcome = cre.evaluate(_evidence(estimate_offset_m=100.0, gt_offset_m=0.0))
+
+    # Assert
+    assert outcome.passes_first_emission_accuracy
+
+
+def test_ac3_above_budget_fails() -> None:
+    """A 120 m first-emission error fails AC-3 and the aggregate verdict."""
+    # Arrange + Act — estimate 120 m north of ground truth.
+    outcome = cre.evaluate(_evidence(estimate_offset_m=120.0, gt_offset_m=0.0))
+
+    # Assert
+    assert not outcome.passes_first_emission_accuracy
+    assert not outcome.passes
+
+
+def test_ac3_missing_estimate_fails() -> None:
+    """Without a post-restart estimate the accuracy is unknown and AC-3 fails."""
+    # Arrange + Act — the estimate fix is absent from the evidence.
+    outcome = cre.evaluate(_evidence(estimate_offset_m=None))
+
+    # Assert
+    assert outcome.first_emission_accuracy_m is None
+    assert not outcome.passes_first_emission_accuracy
+
+
+def test_ac3_missing_gt_fails() -> None:
+    """Without ground truth the accuracy is unknown and AC-3 fails."""
+    # Arrange + Act — the ground-truth fix is absent from the evidence.
+    outcome = cre.evaluate(_evidence(gt_offset_m=None))
+
+    # Assert
+    assert outcome.first_emission_accuracy_m is None
+    assert not outcome.passes_first_emission_accuracy
+
+
+# ───────────────────────── custom budgets ─────────────────────────
+
+
+def test_custom_budgets_apply() -> None:
+    """A caller-supplied restart budget replaces the default one."""
+    # Arrange — 8 s restart latency: over the default 5 s, under a custom 10 s.
+    slow_restart = _evidence(process_restarted_ms=68_000)
+
+    # Act
+    outcome = cre.evaluate(slow_restart, restart_trigger_budget_s=10.0)
+
+    # Assert — the report echoes the custom budget and judges against it.
+    assert outcome.restart_trigger_budget_s == 10.0
+    assert outcome.passes_restart_trigger
+
+
+# ───────────────────────── csv emit ─────────────────────────
+
+
+def test_write_csv_evidence_emits_summary(tmp_path: Path) -> None:
+    """The CSV holds a header plus one summary row with the per-AC columns."""
+    # Arrange
+    report = cre.evaluate(_evidence(estimate_offset_m=50.0, gt_offset_m=0.0))
+    csv_file = tmp_path / "nft-res-02.csv"
+
+    # Act
+    cre.write_csv_evidence(csv_file, report)
+
+    # Assert — exactly two lines; the header names every verdict column.
+    lines = csv_file.read_text().splitlines()
+    assert len(lines) == 2
+    header = lines[0]
+    assert header.startswith("restart_trigger_latency_s")
+    for column in ("ac1_passes", "ac2_passes", "ac3_passes"):
+        assert column in header
+
+
+def test_write_csv_evidence_creates_parent_dir(tmp_path: Path) -> None:
+    """Writing into a not-yet-existing nested directory still produces the file."""
+    # Arrange — target sits two directory levels below tmp_path.
+    report = cre.evaluate(_evidence())
+    csv_file = tmp_path / "deep" / "nest" / "out.csv"
+
+    # Act
+    cre.write_csv_evidence(csv_file, report)
+
+    # Assert
+    assert csv_file.exists()
@@ -0,0 +1,348 @@
+"""Unit tests for ``runner.helpers.escalation_ladder_evaluator`` (AZ-435 / NFT-RES-04)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import escalation_ladder_evaluator as ele
+
+
+def _window(duration_s: float = 35.0, onset_ms: int = 60_000) -> ele.BlackoutWindow:
+    """Build a ``BlackoutWindow`` starting at ``onset_ms`` and lasting ``duration_s`` seconds.
+
+    The duration is converted to whole milliseconds with ``round`` rather than
+    ``int``: truncation drops a millisecond whenever the float product lands
+    just under an integer (e.g. ``int(1.001 * 1000) == 1000``).
+    """
+    duration_ms = round(duration_s * 1000)
+    return ele.BlackoutWindow(
+        onset_monotonic_ms=onset_ms,
+        end_monotonic_ms=onset_ms + duration_ms,
+    )
+
+
+def _est(t_ms: int, *, cov: float, horiz: float = 5.0, fix_type: int = 3) -> ele.EstimateSample:
+    """Build an ``EstimateSample`` at ``t_ms``; only the covariance is mandatory."""
+    fields = {
+        "monotonic_ms": t_ms,
+        "cov_semi_major_m": cov,
+        "horiz_accuracy": horiz,
+        "fix_type": fix_type,
+    }
+    return ele.EstimateSample(**fields)
+
+
+def _st(t_ms: int, text: str) -> ele.StatustextSample:
+    """Build a ``StatustextSample`` carrying ``text`` at ``t_ms``."""
+    sample = ele.StatustextSample(monotonic_ms=t_ms, text=text)
+    return sample
+
+
+# ───────────────────────── BlackoutWindow ─────────────────────────
+
+
+def test_window_is_35s_exact_passes() -> None:
+    """A window of exactly 35 s is recognised as a 35 s window."""
+    window = _window(35.0)
+    assert window.is_35s
+
+
+@pytest.mark.parametrize("duration", [33.5, 36.5])
+def test_window_is_35s_within_tolerance(duration: float) -> None:
+    """Durations 1.5 s either side of 35 s are still accepted."""
+    window = _window(duration)
+    assert window.is_35s
+
+
+@pytest.mark.parametrize("duration", [32.0, 38.0])
+def test_window_is_35s_outside_tolerance(duration: float) -> None:
+    """Durations 3 s either side of 35 s are rejected."""
+    window = _window(duration)
+    assert not window.is_35s
+
+
+# ───────────────────────── AC-1 fix degrade ─────────────────────────
+
+
+def test_ac1_fix_degrades_within_budget_passes() -> None:
+    """A fix-type drop 300 ms after the 100 m covariance crossing passes AC-1."""
+    # Arrange — covariance reaches 100 m at onset+10 s (t=70 s); the fix type
+    # falls from 3 to 2 at onset+10.3 s, i.e. 300 ms later.
+    window = _window()
+    onset = window.onset_monotonic_ms
+    crossing_ms = onset + 10_000
+    degrade_ms = onset + 10_300
+    estimates = [
+        _est(onset + 5000, cov=50.0, fix_type=3),
+        _est(crossing_ms, cov=100.0, fix_type=3),
+        _est(degrade_ms, cov=120.0, fix_type=2),
+    ]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=[])
+
+    # Assert
+    assert result.fix_degrade.cov2d_crossed_at_ms == crossing_ms
+    assert result.fix_degrade.fix_degraded_at_ms == degrade_ms
+    assert result.fix_degrade.latency_ms == 300
+    assert result.fix_degrade.passes
+
+
+def test_ac1_fix_degrade_above_budget_fails() -> None:
+    """A fix-type drop 700 ms after the crossing exceeds the 500 ms budget."""
+    # Arrange — crossing at onset+10 s, degrade at onset+10.7 s.
+    window = _window()
+    onset = window.onset_monotonic_ms
+    estimates = [
+        _est(onset + 10_000, cov=100.0, fix_type=3),
+        _est(onset + 10_700, cov=120.0, fix_type=2),
+    ]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=[])
+
+    # Assert
+    assert result.fix_degrade.latency_ms == 700
+    assert not result.fix_degrade.passes
+
+
+def test_ac1_no_crossing_fails() -> None:
+    """If the covariance never reaches 100 m there is no crossing and AC-1 fails."""
+    # Arrange — a single 50 m sample, 5 s into the blackout.
+    window = _window()
+    lone_sample = _est(window.onset_monotonic_ms + 5000, cov=50.0)
+
+    # Act
+    result = ele.evaluate(window, estimates=[lone_sample], statustexts=[])
+
+    # Assert
+    assert result.fix_degrade.cov2d_crossed_at_ms is None
+    assert not result.fix_degrade.passes
+
+
+def test_ac1_crossing_but_no_fix_degrade_fails() -> None:
+    """A covariance crossing with the fix type stuck at 3 fails AC-1."""
+    # Arrange — covariance crosses at onset+10 s, but fix_type never leaves 3.
+    window = _window()
+    onset = window.onset_monotonic_ms
+    estimates = [
+        _est(onset + 10_000, cov=100.0, fix_type=3),
+        _est(onset + 11_000, cov=120.0, fix_type=3),
+    ]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=[])
+
+    # Assert
+    assert result.fix_degrade.cov2d_crossed_at_ms is not None
+    assert result.fix_degrade.fix_degraded_at_ms is None
+    assert not result.fix_degrade.passes
+
+
+# ───────────────────────── AC-2 failsafe ─────────────────────────
+
+
+def test_ac2_cov500_trigger_passes_within_budget() -> None:
+    """A 500 m covariance trigger answered within 400 ms passes AC-2."""
+    # Arrange — covariance reaches 500 m at onset+25 s; the failsafe horizontal
+    # accuracy and the failsafe STATUSTEXT both appear 400 ms later.
+    window = _window()
+    trigger_ms = window.onset_monotonic_ms + 25_000
+    response_ms = trigger_ms + 400
+    estimates = [
+        _est(trigger_ms, cov=500.0, horiz=200.0, fix_type=2),
+        _est(response_ms, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(response_ms, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert
+    assert result.failsafe.failsafe_trigger_at_ms == trigger_ms
+    assert result.failsafe.horiz_999_latency_ms == 400
+    assert result.failsafe.statustext_latency_ms == 400
+    assert result.failsafe.passes
+
+
+def test_ac2_duration_trigger_used_when_cov500_never_crossed() -> None:
+    """With covariance below 500 m, the 30 s blackout duration is the trigger."""
+    # Arrange — covariance tops out at 200 m; the blackout passes the 30 s
+    # mark at onset+30 s, STATUSTEXT follows after 300 ms, horiz after 400 ms.
+    window = _window(35.0)
+    duration_trigger_ms = window.onset_monotonic_ms + 30_000
+    estimates = [
+        _est(window.onset_monotonic_ms + 10_000, cov=120.0, fix_type=2),
+        _est(duration_trigger_ms, cov=200.0, fix_type=2),
+        _est(duration_trigger_ms + 400, cov=200.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(duration_trigger_ms + 300, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert — the trigger is pinned to onset + 30 s.
+    assert result.failsafe.failsafe_trigger_at_ms == duration_trigger_ms
+    assert result.failsafe.horiz_999_latency_ms == 400
+    assert result.failsafe.statustext_latency_ms == 300
+    assert result.failsafe.passes
+
+
+def test_ac2_uses_earliest_of_cov500_and_duration_trigger() -> None:
+    """When both triggers apply, the earlier covariance trigger is reported."""
+    # Arrange — covariance reaches 500 m at onset+20 s, ahead of the 30 s
+    # duration trigger that a 35 s window would also produce.
+    window = _window(35.0)
+    cov_trigger_ms = window.onset_monotonic_ms + 20_000
+    estimates = [
+        _est(cov_trigger_ms, cov=500.0, fix_type=2),
+        _est(cov_trigger_ms + 300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(cov_trigger_ms + 400, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert — the covariance trigger wins because it comes first.
+    assert result.failsafe.failsafe_trigger_at_ms == cov_trigger_ms
+
+
+def test_ac2_horiz_999_late_fails() -> None:
+    """A failsafe horizontal accuracy arriving 700 ms after the trigger fails AC-2."""
+    # Arrange — STATUSTEXT is on time (300 ms) but the failsafe horizontal
+    # accuracy only shows up 700 ms after the trigger.
+    window = _window()
+    trigger_ms = window.onset_monotonic_ms + 25_000
+    estimates = [
+        _est(trigger_ms, cov=500.0, fix_type=2),
+        _est(trigger_ms + 700, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(trigger_ms + 300, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert
+    assert result.failsafe.horiz_999_latency_ms == 700
+    assert not result.failsafe.horiz_999_passes
+    assert not result.failsafe.passes
+
+
+def test_ac2_missing_statustext_fails() -> None:
+    """A failsafe with no STATUSTEXT at all fails AC-2."""
+    # Arrange — horizontal accuracy responds in 300 ms, no STATUSTEXT is sent.
+    window = _window()
+    trigger_ms = window.onset_monotonic_ms + 25_000
+    estimates = [
+        _est(trigger_ms, cov=500.0, fix_type=2),
+        _est(trigger_ms + 300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=[])
+
+    # Assert
+    assert result.failsafe.statustext_at_ms is None
+    assert not result.failsafe.statustext_passes
+    assert not result.failsafe.passes
+
+
+def test_ac2_no_trigger_fails() -> None:
+    """With neither a 500 m crossing nor a 30 s blackout there is no trigger."""
+    # Arrange — 20 s window, covariance peaks at 120 m.
+    window = _window(20.0)
+    lone_sample = _est(window.onset_monotonic_ms + 5000, cov=120.0, fix_type=2)
+
+    # Act
+    result = ele.evaluate(window, estimates=[lone_sample], statustexts=[])
+
+    # Assert
+    assert result.failsafe.failsafe_trigger_at_ms is None
+    assert not result.failsafe.passes
+
+
+# ───────────────────────── AC-ORDER ─────────────────────────
+
+
+def test_ac_order_cov2d_before_failsafe_passes() -> None:
+    """The 100 m crossing at +10 s preceding the failsafe at +25 s satisfies AC-ORDER."""
+    # Arrange — two escalation steps, 15 s apart, each answered within 400 ms.
+    window = _window()
+    cov2d_ms = window.onset_monotonic_ms + 10_000
+    failsafe_ms = window.onset_monotonic_ms + 25_000
+    estimates = [
+        _est(cov2d_ms, cov=100.0, fix_type=3),
+        _est(cov2d_ms + 300, cov=120.0, fix_type=2),
+        _est(failsafe_ms, cov=500.0, fix_type=2),
+        _est(failsafe_ms + 300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(failsafe_ms + 400, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert
+    assert result.ordering.passes
+    assert result.passes
+
+
+def test_ac_order_simultaneous_crossings_fails_ordering() -> None:
+    """Both thresholds crossed by the same sample violates the strict ordering."""
+    # Arrange — the first sample already reports 500 m, so the 100 m and the
+    # 500 m crossings share one timestamp. The contract wants the AC-1
+    # escalation on the 100 m crossing to fire strictly before the SUT
+    # publishes a failsafe-level covariance.
+    window = _window()
+    both_crossings_ms = window.onset_monotonic_ms + 25_000
+    estimates = [
+        _est(both_crossings_ms, cov=500.0, fix_type=2),
+        _est(both_crossings_ms + 300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(both_crossings_ms + 300, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert — equal timestamps are not "strictly before", so ordering fails.
+    assert result.ordering.cov2d_at_ms == result.ordering.failsafe_trigger_at_ms
+    assert not result.ordering.passes
+    assert not result.passes
+
+
+def test_ac_order_missing_pole_fails_ordering() -> None:
+    """Ordering cannot pass when the failsafe trigger never happens."""
+    # Arrange — 20 s window with only the 100 m crossing and its fix degrade.
+    window = _window(20.0)
+    onset = window.onset_monotonic_ms
+    estimates = [
+        _est(onset + 5000, cov=100.0, fix_type=3),
+        _est(onset + 5300, cov=120.0, fix_type=2),
+    ]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=[])
+
+    # Assert
+    assert not result.ordering.passes
+
+
+# ───────────────────────── aggregate ─────────────────────────
+
+
+def test_aggregate_passes_when_all_ac_pass() -> None:
+    """A clean, in-order escalation inside a 35 s window passes overall."""
+    # Arrange — 100 m crossing at +10 s (degrade after 200 ms), 500 m crossing
+    # at +25 s (horiz after 200 ms, STATUSTEXT after 300 ms).
+    window = _window()
+    cov2d_ms = window.onset_monotonic_ms + 10_000
+    failsafe_ms = window.onset_monotonic_ms + 25_000
+    estimates = [
+        _est(cov2d_ms, cov=100.0, fix_type=3),
+        _est(cov2d_ms + 200, cov=120.0, fix_type=2),
+        _est(failsafe_ms, cov=500.0, fix_type=2),
+        _est(failsafe_ms + 200, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(failsafe_ms + 300, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert
+    assert result.passes
+
+
+def test_aggregate_fails_window_out_of_spec() -> None:
+    """A 25 s window fails the aggregate even with a clean escalation inside it."""
+    # Arrange — same escalation shape as the clean run, squeezed into 25 s.
+    window = _window(25.0)
+    cov2d_ms = window.onset_monotonic_ms + 5_000
+    failsafe_ms = window.onset_monotonic_ms + 15_000
+    estimates = [
+        _est(cov2d_ms, cov=100.0, fix_type=3),
+        _est(cov2d_ms + 200, cov=120.0, fix_type=2),
+        _est(failsafe_ms, cov=500.0, fix_type=2),
+        _est(failsafe_ms + 200, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
+    ]
+    statustexts = [_st(failsafe_ms + 300, ele.STATUSTEXT_FAILSAFE)]
+
+    # Act
+    result = ele.evaluate(window, estimates=estimates, statustexts=statustexts)
+
+    # Assert
+    assert not result.passes_window
+    assert not result.passes
+
+
+# ───────────────────────── csv emit ─────────────────────────
+
+
+def test_write_csv_evidence_one_row(tmp_path: Path) -> None:
+    """The CSV holds a header plus one row, with a column per AC verdict."""
+    # Arrange — an evaluation with no telemetry is enough to emit a row.
+    result = ele.evaluate(_window(), estimates=[], statustexts=[])
+    csv_file = tmp_path / "nft-res-04.csv"
+
+    # Act
+    ele.write_csv_evidence(csv_file, result)
+
+    # Assert
+    lines = csv_file.read_text().splitlines()
+    assert len(lines) == 2
+    header = lines[0]
+    assert header.startswith("window_duration_s")
+    for column in ("ac1_passes", "ac2_passes", "ac_order_passes"):
+        assert column in header
@@ -0,0 +1,318 @@
+"""Unit tests for ``runner.helpers.imu_fallback_drift_evaluator`` (AZ-432 / NFT-RES-01)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import imu_fallback_drift_evaluator as ife
+from runner.helpers.geo import distance_m
+
+
+# Reference point in Derkachi (used so distances are realistic): 49.9°N, 36.3°E.
+REF_LAT = 49.9
+REF_LON = 36.3
+
+
+def _offset_lat(lat: float, north_m: float) -> float:
+    """Approximate latitude offset by ``north_m`` meters (good enough for tests).
+
+    Returns ``lat`` plus ``north_m`` converted to degrees with a constant factor.
+    """
+    # ~111_320 m per degree of latitude at WGS84.
+    return lat + north_m / 111_320.0
+
+
+def _window(duration_s: float, onset_ms: int = 60_000) -> ife.BlackoutWindow:
+    """Build a ``BlackoutWindow`` starting at ``onset_ms`` and lasting ``duration_s`` seconds.
+
+    The duration is converted to whole milliseconds with ``round`` rather than
+    ``int``: truncation drops a millisecond whenever the float product lands
+    just under an integer (e.g. ``int(1.001 * 1000) == 1000``).
+    """
+    duration_ms = round(duration_s * 1000)
+    return ife.BlackoutWindow(
+        onset_monotonic_ms=onset_ms,
+        end_monotonic_ms=onset_ms + duration_ms,
+    )
+
+
+def _samples(*pairs: tuple[int, float]) -> tuple[ife.PositionSample, ...]:
+    """Turn ``(monotonic_ms, north_offset_m)`` pairs into samples at REF_LON."""
+    built = []
+    for stamp_ms, offset_m in pairs:
+        built.append(
+            ife.PositionSample(
+                monotonic_ms=stamp_ms,
+                lat_deg=_offset_lat(REF_LAT, offset_m),
+                lon_deg=REF_LON,
+            )
+        )
+    return tuple(built)
+
+
+# ───────────────────────── BlackoutWindow ─────────────────────────
+
+
+def test_window_in_spec_exact_30s_passes() -> None:
+    """A window of exactly 30 s reports that duration and is in spec."""
+    # Arrange + Act
+    window = _window(30.0)
+
+    # Assert
+    assert window.duration_s == pytest.approx(30.0)
+    assert window.window_in_spec
+
+
+@pytest.mark.parametrize("duration", [28.5, 31.5])
+def test_window_in_spec_within_tolerance(duration: float) -> None:
+    """Durations 1.5 s either side of 30 s are still in spec."""
+    window = _window(duration)
+    assert window.window_in_spec
+
+
+@pytest.mark.parametrize("duration", [27.0, 33.0])
+def test_window_in_spec_outside_tolerance(duration: float) -> None:
+    """Durations 3 s either side of 30 s are out of spec."""
+    window = _window(duration)
+    assert not window.window_in_spec
+
+
+# ───────────────────────── evaluate_subcase ─────────────────────────
+
+
+def test_evaluate_subcase_unknown_subcase_raises() -> None:
+    """An unrecognised sub-case name is rejected with ``ValueError``."""
+    # Arrange — build the fixtures outside the ``raises`` block so that only an
+    # error coming from ``evaluate_subcase`` itself can satisfy the assertion.
+    window = _window(30.0)
+    estimates = _samples((90_000, 60.0))
+    ground_truth = _samples((90_000, 0.0))
+
+    # Act + Assert
+    with pytest.raises(ValueError):
+        ife.evaluate_subcase(
+            window,
+            estimates=estimates,
+            ground_truth=ground_truth,
+            subcase="some_typo",
+        )
+
+
+def test_evaluate_subcase_no_imu_within_budget_passes() -> None:
+    """An 80 m drift at blackout end is inside the no-IMU budget."""
+    # Arrange — estimate 80 m north of ground truth at the window's end.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    estimates = _samples((end_ms, 80.0))
+    ground_truth = _samples((end_ms, 0.0))
+
+    # Act
+    outcome = ife.evaluate_subcase(
+        window, estimates, ground_truth, subcase=ife.SUBCASE_NO_IMU
+    )
+
+    # Assert — the report echoes the sub-case and its default budget.
+    assert outcome.subcase == ife.SUBCASE_NO_IMU
+    assert outcome.budget_m == ife.NO_IMU_BUDGET_M
+    assert outcome.drift_m is not None
+    assert outcome.drift_m == pytest.approx(80.0, abs=0.5)
+    assert outcome.passes
+
+
+def test_evaluate_subcase_no_imu_above_budget_fails() -> None:
+    """A 120 m drift at blackout end exceeds the 100 m no-IMU budget."""
+    # Arrange — estimate 120 m north of ground truth at the window's end.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    estimates = _samples((end_ms, 120.0))
+    ground_truth = _samples((end_ms, 0.0))
+
+    # Act
+    outcome = ife.evaluate_subcase(
+        window, estimates, ground_truth, subcase=ife.SUBCASE_NO_IMU
+    )
+
+    # Assert
+    assert outcome.drift_m is not None
+    assert outcome.drift_m == pytest.approx(120.0, abs=0.5)
+    assert not outcome.passes
+
+
+def test_evaluate_subcase_good_imu_within_budget_passes() -> None:
+    """A 40 m drift is inside the 50 m good-IMU budget."""
+    # Arrange — estimate 40 m north of ground truth at the window's end.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    estimates = _samples((end_ms, 40.0))
+    ground_truth = _samples((end_ms, 0.0))
+
+    # Act
+    outcome = ife.evaluate_subcase(
+        window, estimates, ground_truth, subcase=ife.SUBCASE_GOOD_IMU
+    )
+
+    # Assert
+    assert outcome.budget_m == ife.GOOD_IMU_BUDGET_M
+    assert outcome.passes
+
+
+def test_evaluate_subcase_good_imu_above_budget_fails() -> None:
+    """A 70 m drift exceeds the 50 m good-IMU budget."""
+    # Arrange — estimate 70 m north of ground truth at the window's end.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    estimates = _samples((end_ms, 70.0))
+    ground_truth = _samples((end_ms, 0.0))
+
+    # Act
+    outcome = ife.evaluate_subcase(
+        window, estimates, ground_truth, subcase=ife.SUBCASE_GOOD_IMU
+    )
+
+    # Assert
+    assert not outcome.passes
+
+
+def test_evaluate_subcase_no_estimate_at_end_returns_none_drift() -> None:
+    """A pre-onset estimate still counts as "at end"; an empty series yields no drift."""
+    # Arrange — the single estimate is stamped 5 s BEFORE blackout onset.
+    window = _window(30.0)
+    pre_onset_only = _samples((window.onset_monotonic_ms - 5_000, 0.0))
+    ground_truth = _samples((window.end_monotonic_ms, 0.0))
+
+    # Act
+    with_stale_estimate = ife.evaluate_subcase(
+        window, pre_onset_only, ground_truth, subcase=ife.SUBCASE_NO_IMU
+    )
+
+    # Assert — onset−5 s is ≤ end_ms, so that sample is still picked up as
+    # the estimate at blackout end.
+    assert with_stale_estimate.estimate_at_end is not None
+
+    # Act — the genuine "no estimate" case: nothing at all is ≤ end_ms.
+    no_estimates: tuple[ife.PositionSample, ...] = ()
+    without_estimate = ife.evaluate_subcase(
+        window, no_estimates, ground_truth, subcase=ife.SUBCASE_NO_IMU
+    )
+
+    # Assert
+    assert without_estimate.drift_m is None
+    assert not without_estimate.passes
+
+
+def test_evaluate_subcase_picks_latest_sample_at_or_before_end() -> None:
+    """The estimate used is the latest one stamped at or before the window's end."""
+    # Arrange — samples 1 s before, exactly at, and 1 s after the end; the
+    # last one lies outside the window and must not be chosen.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    estimates = _samples(
+        (end_ms - 1000, 30.0),
+        (end_ms, 50.0),
+        (end_ms + 1000, 200.0),
+    )
+    ground_truth = _samples((end_ms, 0.0))
+
+    # Act
+    outcome = ife.evaluate_subcase(
+        window, estimates, ground_truth, subcase=ife.SUBCASE_GOOD_IMU
+    )
+
+    # Assert — the boundary sample (50 m) is selected.
+    assert outcome.estimate_at_end is not None
+    assert outcome.estimate_at_end.monotonic_ms == end_ms
+    assert outcome.drift_m == pytest.approx(50.0, abs=0.5)
+
+
+def test_evaluate_subcase_explicit_budget_overrides_default() -> None:
+    """An explicit ``budget_m`` replaces the sub-case's default budget."""
+    # Arrange — 60 m drift: fine under the default 100 m, too much for 40 m.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    estimates = _samples((end_ms, 60.0))
+    ground_truth = _samples((end_ms, 0.0))
+
+    # Act
+    outcome = ife.evaluate_subcase(
+        window,
+        estimates,
+        ground_truth,
+        subcase=ife.SUBCASE_NO_IMU,
+        budget_m=40.0,
+    )
+
+    # Assert
+    assert outcome.budget_m == 40.0
+    assert not outcome.passes
+
+
+# ───────────────────────── evaluate (aggregate) ─────────────────────────
+
+
+def test_evaluate_both_subcases_pass() -> None:
+    """Two in-budget sub-cases inside an in-spec window pass the aggregate."""
+    # Arrange — 80 m drift for no-IMU, 40 m for good-IMU, shared ground truth.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    ground_truth = _samples((end_ms, 0.0))
+    sub_cases = [
+        (ife.SUBCASE_NO_IMU, _samples((end_ms, 80.0)), ground_truth),
+        (ife.SUBCASE_GOOD_IMU, _samples((end_ms, 40.0)), ground_truth),
+    ]
+
+    # Act
+    outcome = ife.evaluate(window, sub_cases=sub_cases)
+
+    # Assert — aggregate verdicts, then each sub-case looked up by name.
+    assert len(outcome.sub_cases) == 2
+    assert outcome.passes_window
+    assert outcome.passes
+    assert outcome.by_subcase(ife.SUBCASE_NO_IMU).passes
+    assert outcome.by_subcase(ife.SUBCASE_GOOD_IMU).passes
+
+
+def test_evaluate_one_subcase_fails_fails_aggregate() -> None:
+    """One over-budget sub-case is enough to fail the aggregate."""
+    # Arrange — no-IMU drifts 80 m (≤100, passes); good-IMU drifts 70 m
+    # (>50, fails).
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    ground_truth = _samples((end_ms, 0.0))
+    sub_cases = [
+        (ife.SUBCASE_NO_IMU, _samples((end_ms, 80.0)), ground_truth),
+        (ife.SUBCASE_GOOD_IMU, _samples((end_ms, 70.0)), ground_truth),
+    ]
+
+    # Act
+    outcome = ife.evaluate(window, sub_cases=sub_cases)
+
+    # Assert
+    assert not outcome.passes
+
+
+def test_evaluate_window_out_of_spec_fails_even_if_drift_passes() -> None:
+    """A 25 s window fails the aggregate even with only 10 m of drift."""
+    # Arrange — window is under tolerance; the drift is well inside budget.
+    window = _window(25.0)
+    end_ms = window.end_monotonic_ms
+    lone_case = (
+        ife.SUBCASE_NO_IMU,
+        _samples((end_ms, 10.0)),
+        _samples((end_ms, 0.0)),
+    )
+
+    # Act
+    outcome = ife.evaluate(window, sub_cases=[lone_case])
+
+    # Assert
+    assert not outcome.passes_window
+    assert not outcome.passes
+
+
+def test_evaluate_by_subcase_unknown_raises() -> None:
+    """Looking up a sub-case missing from the report raises ``KeyError``."""
+    # Arrange — a report evaluated with no sub-cases at all.
+    empty_report = ife.evaluate(_window(30.0), sub_cases=[])
+
+    # Act + Assert
+    with pytest.raises(KeyError):
+        empty_report.by_subcase(ife.SUBCASE_NO_IMU)
+
+
+# ───────────────────────── distance sanity ─────────────────────────
+
+
+def test_helper_offset_lat_aligns_with_vincenty_within_meter() -> None:
+    """Check the ``_offset_lat`` shortcut against ``runner.helpers.geo.distance_m``."""
+    # Arrange — a point nominally 100 m north of the reference.
+    shifted_lat = _offset_lat(REF_LAT, 100.0)
+
+    # Act
+    measured_m = distance_m(REF_LAT, REF_LON, shifted_lat, REF_LON)
+
+    # Assert — the approximation must land less than 1 m from the nominal 100 m.
+    assert -1.0 < measured_m - 100.0 < 1.0
+
+
+# ───────────────────────── csv emit ─────────────────────────
+
+
+def test_write_csv_evidence_one_row_per_subcase(tmp_path: Path) -> None:
+    """The CSV holds a header followed by one row for each sub-case."""
+    # Arrange — one no-IMU (80 m) and one good-IMU (40 m) sub-case.
+    window = _window(30.0)
+    end_ms = window.end_monotonic_ms
+    no_imu_case = (ife.SUBCASE_NO_IMU, _samples((end_ms, 80.0)), _samples((end_ms, 0.0)))
+    good_imu_case = (ife.SUBCASE_GOOD_IMU, _samples((end_ms, 40.0)), _samples((end_ms, 0.0)))
+    report = ife.evaluate(window, sub_cases=[no_imu_case, good_imu_case])
+    csv_file = tmp_path / "nft-res-01.csv"
+
+    # Act
+    ife.write_csv_evidence(csv_file, report)
+
+    # Assert — header shape first, then both sub-cases in either row order.
+    lines = csv_file.read_text().splitlines()
+    assert lines[0].startswith("subcase,")
+    assert "drift_m" in lines[0]
+    assert len(lines) == 3  # header + 2 sub-cases
+    data_rows = lines[1:3]
+    assert any(ife.SUBCASE_NO_IMU in row for row in data_rows)
+    assert any(ife.SUBCASE_GOOD_IMU in row for row in data_rows)
+
+
+def test_write_csv_evidence_creates_parent_dir(tmp_path: Path) -> None:
+    """Writing into a not-yet-existing nested directory still produces the file."""
+    # Arrange — an empty report and a target two directory levels down.
+    report = ife.evaluate(_window(30.0), sub_cases=[])
+    csv_file = tmp_path / "nested" / "deep" / "nft-res-01.csv"
+
+    # Act
+    ife.write_csv_evidence(csv_file, report)
+
+    # Assert
+    assert csv_file.exists()
@@ -0,0 +1,227 @@
+"""Unit tests for ``runner.helpers.monte_carlo_envelope_evaluator`` (AZ-434 / NFT-RES-03)."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import monte_carlo_envelope_evaluator as mce
+
+
+def _iter(iter_id: str, seed: int, samples: list[tuple[float, float]]) -> mce.IterationOutcome:
+    return mce.IterationOutcome(
+        iteration_id=iter_id,
+        iteration_seed=seed,
+        samples=tuple(mce.FrameSample(error_m=e, cov_semi_major_m=c) for e, c in samples),
+    )
+
+
+def _uniform_iter(iter_id: str, seed: int, *, n: int, error_m: float, cov_m: float) -> mce.IterationOutcome:
+    return _iter(iter_id, seed, [(error_m, cov_m) for _ in range(n)])
+
+
+# ───────────────────────── AC-1 iteration count ─────────────────────────
+
+
+def test_ac1_at_min_iterations_passes() -> None:
+    # Arrange — 100 iterations × 1 sample each, all covered.
+    iterations = [_uniform_iter(f"it{i:03d}", i, n=1, error_m=0.5, cov_m=1.0) for i in range(100)]
+
+    # Act
+    report = mce.evaluate(iterations, master_seed=42)
+
+    # Assert
+    assert report.iteration_count == 100
+    assert report.passes_iteration_count
+    assert report.passes
+
+
+def test_ac1_below_min_iterations_fails() -> None:
+    # Arrange — 99 iterations.
+    iterations = [_uniform_iter(f"it{i:03d}", i, n=1, error_m=0.5, cov_m=1.0) for i in range(99)]
+
+    # Act
+    report = mce.evaluate(iterations, master_seed=42)
+
+    # Assert
+    assert report.iteration_count == 99
+    assert not report.passes_iteration_count
+    assert not report.passes
+
+
+def test_ac1_custom_min_iteration_count() -> None:
+    # Arrange — 5 iterations with custom min=5.
+    iterations = [_uniform_iter(f"it{i}", i, n=10, error_m=0.5, cov_m=1.0) for i in range(5)]
+
+    # Act
+    report = mce.evaluate(iterations, master_seed=1, min_iteration_count=5)
+
+    # Assert
+    assert report.passes_iteration_count
+
+
+# ───────────────────────── AC-3 envelope ratio ─────────────────────────
+
+
+def test_ac3_all_covered_passes() -> None:
+    # Arrange — error = 0.5 * cov, multiplier 1.96 → trivially covered.
+    iterations = [_uniform_iter(f"it{i}", i, n=10, error_m=0.5, cov_m=1.0) for i in range(100)]
+
+    # Act
+    report = mce.evaluate(iterations, master_seed=0)
+
+    # Assert
+    assert report.total_samples == 1000
+    assert report.covered_samples == 1000
+    assert report.envelope_ratio == pytest.approx(1.0)
+    assert report.passes_envelope
+
+
+def test_ac3_at_budget_passes() -> None:
+    # Arrange — 95 covered + 5 not covered = ratio 0.95 exactly.
+    covered = [_uniform_iter(f"c{i}", i, n=1, error_m=0.5, cov_m=1.0) for i in range(95)]
+    uncovered = [_uniform_iter(f"u{i}", 1000 + i, n=1, error_m=3.0, cov_m=1.0) for i in range(5)]
+
+    # Act
+    report = mce.evaluate(covered + uncovered, master_seed=0)
+
+    # Assert — 3.0 > 1.96, so uncovered. 95/100 = 0.95 exactly.
+    assert report.envelope_ratio == pytest.approx(0.95)
+    assert report.passes_envelope
+
+
+def test_ac3_below_budget_fails() -> None:
+    # Arrange — 90 covered + 10 not covered = ratio 0.90.
+    covered = [_uniform_iter(f"c{i}", i, n=1, error_m=0.5, cov_m=1.0) for i in range(90)]
+    uncovered = [_uniform_iter(f"u{i}", 1000 + i, n=1, error_m=3.0, cov_m=1.0) for i in range(10)]
+
+    # Act
+    report = mce.evaluate(covered + uncovered, master_seed=0)
+
+    # Assert
+    assert report.envelope_ratio == pytest.approx(0.90)
+    assert not report.passes_envelope
+
+
+def test_ac3_edge_error_exactly_at_envelope_counts_as_covered() -> None:
+    # Arrange — error = 1.96 * cov should still count.
+    iterations = [_uniform_iter(f"it{i}", i, n=1, error_m=1.96, cov_m=1.0) for i in range(100)]
+
+    # Act
+    report = mce.evaluate(iterations, master_seed=0)
+
+    # Assert
+    assert report.envelope_ratio == pytest.approx(1.0)
+
+
+def test_ac3_empty_iterations_returns_none_ratio() -> None:
+    # Act
+    report = mce.evaluate([], master_seed=0)
+
+    # Assert
+    assert report.total_samples == 0
+    assert report.envelope_ratio is None
+    assert not report.passes_envelope
+    assert not report.passes
+
+
+def test_ac3_custom_envelope_multiplier() -> None:
+    # Arrange — error = 1.0, cov = 1.0; default multiplier 1.96 → covered;
+    # custom multiplier 0.5 → uncovered.
+    iterations = [_uniform_iter(f"it{i}", i, n=1, error_m=1.0, cov_m=1.0) for i in range(100)]
+
+    # Act
+    report = mce.evaluate(iterations, master_seed=0, envelope_multiplier=0.5)
+
+    # Assert — 1.0 > 0.5 * 1.0 = 0.5 → uncovered.
+    assert report.envelope_ratio == pytest.approx(0.0)
+    assert not report.passes_envelope
+
+
+# ───────────────────────── AC-2 determinism ─────────────────────────
+
+
+def test_iteration_hash_same_for_same_inputs() -> None:
+    # Arrange
+    it1 = _iter("it1", 7, [(0.5, 1.0), (0.6, 1.0), (0.4, 0.9)])
+    it2 = _iter("it1", 7, [(0.5, 1.0), (0.6, 1.0), (0.4, 0.9)])
+
+    # Assert
+    assert mce.iteration_hash(it1) == mce.iteration_hash(it2)
+
+
+def test_iteration_hash_differs_when_samples_differ() -> None:
+    # Arrange
+    it1 = _iter("it1", 7, [(0.5, 1.0)])
+    it2 = _iter("it1", 7, [(0.6, 1.0)])
+
+    # Assert
+    assert mce.iteration_hash(it1) != mce.iteration_hash(it2)
+
+
+def test_iteration_hash_differs_when_seed_differs() -> None:
+    # Arrange
+    it1 = _iter("it1", 7, [(0.5, 1.0)])
+    it2 = _iter("it1", 8, [(0.5, 1.0)])
+
+    # Assert
+    assert mce.iteration_hash(it1) != mce.iteration_hash(it2)
+
+
+def test_determinism_fingerprint_stable_across_two_evaluate_calls() -> None:
+    # Arrange
+    iterations = [_uniform_iter(f"it{i:03d}", i, n=3, error_m=0.5, cov_m=1.0) for i in range(10)]
+    r1 = mce.evaluate(iterations, master_seed=42)
+    r2 = mce.evaluate(iterations, master_seed=42)
+
+    # Assert
+    assert mce.determinism_fingerprint(r1) == mce.determinism_fingerprint(r2)
+
+
+def test_determinism_fingerprint_differs_when_master_seed_changes() -> None:
+    # Arrange
+    iterations = [_uniform_iter(f"it{i:03d}", i, n=3, error_m=0.5, cov_m=1.0) for i in range(10)]
+    r1 = mce.evaluate(iterations, master_seed=42)
+    r2 = mce.evaluate(iterations, master_seed=43)
+
+    # Assert
+    assert mce.determinism_fingerprint(r1) != mce.determinism_fingerprint(r2)
+
+
+# ───────────────────────── csv emit ─────────────────────────
+
+
+def test_write_csv_evidence_emits_summary(tmp_path: Path) -> None:
+    # Arrange
+    iterations = [_uniform_iter(f"it{i:03d}", i, n=10, error_m=0.5, cov_m=1.0) for i in range(100)]
+    report = mce.evaluate(iterations, master_seed=42)
+    out_path = tmp_path / "nft-res-03.csv"
+
+    # Act
+    mce.write_csv_evidence(out_path, report)
+
+    # Assert
+    rows = out_path.read_text().splitlines()
+    assert len(rows) == 2
+    assert rows[0].startswith("master_seed")
+    assert "ac1_iteration_count_passes" in rows[0]
+    assert "ac3_envelope_passes" in rows[0]
+    assert "fingerprint_sha256" in rows[0]
+
+
+def test_write_per_iteration_csv_row_per_iter(tmp_path: Path) -> None:
+    # Arrange
+    iterations = [_uniform_iter(f"it{i}", i, n=2, error_m=0.5, cov_m=1.0) for i in range(5)]
+    report = mce.evaluate(iterations, master_seed=1, min_iteration_count=5)
+    out_path = tmp_path / "per-iter.csv"
+
+    # Act
+    mce.write_per_iteration_csv(out_path, report)
+
+    # Assert
+    rows = out_path.read_text().splitlines()
+    assert rows[0] == (
+        "iteration_id,iteration_seed,frame_count,covered_count,envelope_ratio,iteration_hash_sha256"
+    )
+    assert len(rows) == 6  # header + 5 iterations