[AZ-432] [AZ-433] [AZ-434] [AZ-435] Add NFT-RES-01..04 resilience scenarios

Batch 86: 4 NFT-RES blackbox scenarios + 4 helper evaluators + 74 unit
tests + directory-layout registration.

* AZ-432 NFT-RES-01: 30 s IMU-only fallback drift bound (AC-3.5 + AC-NEW-7);
  two sub-cases (no_imu ≤100m, good_imu_combined_factor ≤50m).
* AZ-433 NFT-RES-02: companion mid-flight reboot (AC-5.2 + AC-5.3); resume
  ≤30s + first-emission accuracy ≤100m.
* AZ-434 NFT-RES-03: 100-iteration Monte Carlo envelope (AC-NEW-4);
  iteration-count + master-seed determinism + envelope ratio ≥0.95.
  Canonical-param by default; E2E_NFT_RES_03_FULL_MATRIX=1 unlocks matrix.
* AZ-435 NFT-RES-04: 35s blackout+spoof escalation ladder (AC-NEW-8);
  AC-1 (cov-2d→fix-degrade ≤500ms) + AC-2 (failsafe→999+STATUSTEXT
  ≤500ms) + AC-ORDER (strict ordering).

Verdict: PASS_WITH_WARNINGS (0 Critical, 0 High, 0 Medium, 5 Low).
F5 documents intentional threshold duplication with blackout_spoof
evaluator (prevents contract drift between FT-N-04 and NFT-RES-04).

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-17 17:09:04 +03:00
parent 23640a784f
commit 330893be5c
15 changed files with 3325 additions and 0 deletions
@@ -0,0 +1,278 @@
"""Unit tests for ``runner.helpers.companion_reboot_evaluator`` (AZ-433 / NFT-RES-02)."""
from __future__ import annotations
from pathlib import Path
import pytest
from runner.helpers import companion_reboot_evaluator as cre
# Reference coordinates (degrees) that the fixture builders below offset from.
REF_LAT = 49.9
REF_LON = 36.3
def _offset_lat(lat: float, north_m: float) -> float:
return lat + north_m / 111_320.0
def _evidence(
    *,
    restart_command_ms: int = 60_000,
    process_restarted_ms: int | None = 62_000,
    first_emission_ms: int | None = 80_000,
    estimate_offset_m: float | None = 0.0,
    gt_offset_m: float | None = 0.0,
) -> cre.RestartEvidence:
    """Build a ``RestartEvidence`` fixture with overridable timestamps and fixes.

    ``estimate_offset_m`` and ``gt_offset_m`` are northward offsets in metres
    from the reference point; passing ``None`` leaves that fix unset.
    """
    # Both fixes carry the first-emission timestamp, or 0 when there is none.
    stamp_ms = first_emission_ms or 0

    def _fix_at(offset_m: float | None) -> cre.GeoFix | None:
        if offset_m is None:
            return None
        return cre.GeoFix(
            monotonic_ms=stamp_ms,
            lat_deg=_offset_lat(REF_LAT, offset_m),
            lon_deg=REF_LON,
        )

    return cre.RestartEvidence(
        restart_command_monotonic_ms=restart_command_ms,
        process_restarted_monotonic_ms=process_restarted_ms,
        first_post_restart_emission_monotonic_ms=first_emission_ms,
        first_post_restart_estimate=_fix_at(estimate_offset_m),
        ground_truth_at_first_emission=_fix_at(gt_offset_m),
    )
# ───────────────────────── happy path ─────────────────────────
def test_evaluate_clean_run_passes_all_acs() -> None:
    """A nominal reboot (2 s restart, 20 s resume, ~50 m error) passes every AC."""
    report = cre.evaluate(
        _evidence(
            process_restarted_ms=62_000,
            first_emission_ms=80_000,
            estimate_offset_m=50.0,
            gt_offset_m=0.0,
        )
    )

    # Measured quantities first ...
    assert report.restart_trigger_latency_s == pytest.approx(2.0)
    assert report.resume_time_s == pytest.approx(20.0)
    assert report.first_emission_accuracy_m == pytest.approx(50.0, abs=0.5)
    # ... then the per-AC verdicts and the aggregate.
    assert report.passes_restart_trigger
    assert report.passes_resume_time
    assert report.passes_first_emission_accuracy
    assert report.passes
# ───────────────────────── AC-1: restart trigger ─────────────────────────
def test_ac1_at_budget_passes() -> None:
    """A restart latency of exactly 5 s still passes AC-1."""
    restarted_at_ms = 60_000 + 5000
    report = cre.evaluate(_evidence(process_restarted_ms=restarted_at_ms))
    assert report.passes_restart_trigger
def test_ac1_above_budget_fails() -> None:
    """A 6 s restart latency fails AC-1 and therefore the aggregate."""
    restarted_at_ms = 60_000 + 6000
    report = cre.evaluate(_evidence(process_restarted_ms=restarted_at_ms))

    assert report.restart_trigger_latency_s == pytest.approx(6.0)
    assert not report.passes_restart_trigger
    assert not report.passes
def test_ac1_process_never_restarts_fails() -> None:
    """With no restart timestamp the latency is ``None`` and AC-1 fails."""
    never_restarted = _evidence(process_restarted_ms=None)
    report = cre.evaluate(never_restarted)

    assert report.restart_trigger_latency_s is None
    assert not report.passes_restart_trigger
    assert not report.passes
def test_ac1_negative_latency_raises() -> None:
    """A restart stamped before the restart command (clock skew) is rejected."""
    skewed = _evidence(process_restarted_ms=60_000 - 1000)
    with pytest.raises(ValueError):
        cre.evaluate(skewed)
# ───────────────────────── AC-2: resume time ─────────────────────────
def test_ac2_at_budget_passes() -> None:
    """A resume time of exactly 30 s still passes AC-2."""
    first_emission_at_ms = 60_000 + 30_000
    report = cre.evaluate(_evidence(first_emission_ms=first_emission_at_ms))
    assert report.passes_resume_time
def test_ac2_above_budget_fails() -> None:
    """A 31 s resume time fails AC-2."""
    first_emission_at_ms = 60_000 + 31_000
    report = cre.evaluate(_evidence(first_emission_ms=first_emission_at_ms))

    assert report.resume_time_s == pytest.approx(31.0)
    assert not report.passes_resume_time
def test_ac2_no_emission_fails() -> None:
    """No post-restart emission at all: resume time is ``None`` and the run fails."""
    silent = _evidence(
        first_emission_ms=None,
        estimate_offset_m=None,
        gt_offset_m=None,
    )
    report = cre.evaluate(silent)

    assert report.resume_time_s is None
    assert not report.passes_resume_time
    assert not report.passes
def test_ac2_negative_resume_raises() -> None:
    """An emission stamped before the restart command is rejected."""
    too_early = _evidence(first_emission_ms=59_000)
    with pytest.raises(ValueError):
        cre.evaluate(too_early)
# ───────────────────────── AC-3: first-emission accuracy ─────────────────────────
def test_ac3_within_budget_passes() -> None:
    """An ~80 m first-emission error is inside the 100 m budget."""
    report = cre.evaluate(_evidence(estimate_offset_m=80.0, gt_offset_m=0.0))

    assert report.first_emission_accuracy_m == pytest.approx(80.0, abs=0.5)
    assert report.passes_first_emission_accuracy
def test_ac3_at_budget_passes() -> None:
    """A nominal 100 m offset (the budget itself) still passes AC-3."""
    on_budget = _evidence(estimate_offset_m=100.0, gt_offset_m=0.0)
    report = cre.evaluate(on_budget)
    assert report.passes_first_emission_accuracy
def test_ac3_above_budget_fails() -> None:
    """A ~120 m first-emission error fails AC-3 and the aggregate."""
    over_budget = _evidence(estimate_offset_m=120.0, gt_offset_m=0.0)
    report = cre.evaluate(over_budget)

    assert not report.passes_first_emission_accuracy
    assert not report.passes
def test_ac3_missing_estimate_fails() -> None:
    """Without a post-restart estimate the accuracy is ``None`` and AC-3 fails."""
    report = cre.evaluate(_evidence(estimate_offset_m=None))

    assert report.first_emission_accuracy_m is None
    assert not report.passes_first_emission_accuracy
def test_ac3_missing_gt_fails() -> None:
    """Without ground truth the accuracy is ``None`` and AC-3 fails."""
    report = cre.evaluate(_evidence(gt_offset_m=None))

    assert report.first_emission_accuracy_m is None
    assert not report.passes_first_emission_accuracy
# ───────────────────────── custom budgets ─────────────────────────
def test_custom_budgets_apply() -> None:
    """An 8 s restart latency passes once the budget is raised to 10 s."""
    slow_restart = _evidence(process_restarted_ms=68_000)
    report = cre.evaluate(slow_restart, restart_trigger_budget_s=10.0)

    # The report echoes the overridden budget and judges against it.
    assert report.restart_trigger_budget_s == 10.0
    assert report.passes_restart_trigger
# ───────────────────────── csv emit ─────────────────────────
def test_write_csv_evidence_emits_summary(tmp_path: Path) -> None:
    """The NFT-RES-02 CSV has a header plus one summary row with per-AC columns."""
    # Arrange
    evidence = _evidence(estimate_offset_m=50.0, gt_offset_m=0.0)
    report = cre.evaluate(evidence)
    out_path = tmp_path / "nft-res-02.csv"
    # Act
    cre.write_csv_evidence(out_path, report)
    # Assert — decode explicitly so the check does not depend on the host locale.
    rows = out_path.read_text(encoding="utf-8").splitlines()
    assert len(rows) == 2
    header = rows[0]
    assert header.startswith("restart_trigger_latency_s")
    assert "ac1_passes" in header
    assert "ac2_passes" in header
    assert "ac3_passes" in header
def test_write_csv_evidence_creates_parent_dir(tmp_path: Path) -> None:
    """Writing into a not-yet-existing nested directory still produces the file."""
    report = cre.evaluate(_evidence())
    target = tmp_path / "deep" / "nest" / "out.csv"

    cre.write_csv_evidence(target, report)

    assert target.exists()
@@ -0,0 +1,348 @@
"""Unit tests for ``runner.helpers.escalation_ladder_evaluator`` (AZ-435 / NFT-RES-04)."""
from __future__ import annotations
from pathlib import Path
import pytest
from runner.helpers import escalation_ladder_evaluator as ele
def _window(duration_s: float = 35.0, onset_ms: int = 60_000) -> ele.BlackoutWindow:
    """Build a blackout window of ``duration_s`` seconds starting at ``onset_ms``."""
    duration_ms = int(duration_s * 1000)
    end_ms = onset_ms + duration_ms
    return ele.BlackoutWindow(onset_monotonic_ms=onset_ms, end_monotonic_ms=end_ms)
def _est(t_ms: int, *, cov: float, horiz: float = 5.0, fix_type: int = 3) -> ele.EstimateSample:
    """Build one estimate sample at ``t_ms`` with the given covariance, accuracy and fix type."""
    fields = {
        "monotonic_ms": t_ms,
        "cov_semi_major_m": cov,
        "horiz_accuracy": horiz,
        "fix_type": fix_type,
    }
    return ele.EstimateSample(**fields)
def _st(t_ms: int, text: str) -> ele.StatustextSample:
    """Build one STATUSTEXT sample carrying ``text`` at ``t_ms``."""
    sample = ele.StatustextSample(monotonic_ms=t_ms, text=text)
    return sample
# ───────────────────────── BlackoutWindow ─────────────────────────
def test_window_is_35s_exact_passes() -> None:
    """A window of exactly 35 s is recognised as in-spec."""
    window = _window(35.0)
    assert window.is_35s
@pytest.mark.parametrize("duration", [33.5, 36.5])
def test_window_is_35s_within_tolerance(duration: float) -> None:
    """Durations 1.5 s either side of 35 s are still accepted."""
    window = _window(duration)
    assert window.is_35s
@pytest.mark.parametrize("duration", [32.0, 38.0])
def test_window_is_35s_outside_tolerance(duration: float) -> None:
    """Durations 3 s either side of 35 s are rejected."""
    window = _window(duration)
    assert not window.is_35s
# ───────────────────────── AC-1 fix degrade ─────────────────────────
def test_ac1_fix_degrades_within_budget_passes() -> None:
    """Fix type drops 300 ms after covariance reaches 100 m — inside the 500 ms budget."""
    window = _window()
    t0 = window.onset_monotonic_ms
    crossing_ms = t0 + 10_000
    degrade_ms = t0 + 10_300
    timeline = [
        _est(t0 + 5000, cov=50.0, fix_type=3),
        _est(crossing_ms, cov=100.0, fix_type=3),  # covariance crossing
        _est(degrade_ms, cov=120.0, fix_type=2),  # fix degrade
    ]

    result = ele.evaluate(window, estimates=timeline, statustexts=[])

    ac1 = result.fix_degrade
    assert ac1.cov2d_crossed_at_ms == crossing_ms
    assert ac1.fix_degraded_at_ms == degrade_ms
    assert ac1.latency_ms == 300
    assert ac1.passes
def test_ac1_fix_degrade_above_budget_fails() -> None:
    """A fix degrade 700 ms after the crossing exceeds the 500 ms budget."""
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 10_000, cov=100.0, fix_type=3),
        _est(t0 + 10_700, cov=120.0, fix_type=2),
    ]

    result = ele.evaluate(window, estimates=timeline, statustexts=[])

    assert result.fix_degrade.latency_ms == 700
    assert not result.fix_degrade.passes
def test_ac1_no_crossing_fails() -> None:
    """If covariance never reaches 100 m there is no crossing and AC-1 fails."""
    window = _window()
    below_threshold = [_est(window.onset_monotonic_ms + 5000, cov=50.0)]

    result = ele.evaluate(window, estimates=below_threshold, statustexts=[])

    assert result.fix_degrade.cov2d_crossed_at_ms is None
    assert not result.fix_degrade.passes
def test_ac1_crossing_but_no_fix_degrade_fails() -> None:
    """Covariance crosses but fix type stays at 3, so AC-1 fails."""
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 10_000, cov=100.0, fix_type=3),
        _est(t0 + 11_000, cov=120.0, fix_type=3),  # never degrades
    ]

    result = ele.evaluate(window, estimates=timeline, statustexts=[])

    ac1 = result.fix_degrade
    assert ac1.cov2d_crossed_at_ms is not None
    assert ac1.fix_degraded_at_ms is None
    assert not ac1.passes
# ───────────────────────── AC-2 failsafe ─────────────────────────
def test_ac2_cov500_trigger_passes_within_budget() -> None:
    """Covariance hits 500 m at onset+25 s; 999 and STATUSTEXT both follow 400 ms later."""
    window = _window()
    t0 = window.onset_monotonic_ms
    trigger_ms = t0 + 25_000
    reaction_ms = t0 + 25_400
    timeline = [
        _est(trigger_ms, cov=500.0, horiz=200.0, fix_type=2),  # trigger
        _est(reaction_ms, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(reaction_ms, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    ac2 = result.failsafe
    assert ac2.failsafe_trigger_at_ms == trigger_ms
    assert ac2.horiz_999_latency_ms == 400
    assert ac2.statustext_latency_ms == 400
    assert ac2.passes
def test_ac2_duration_trigger_used_when_cov500_never_crossed() -> None:
    """With covariance below 500 m throughout, the 30 s duration mark is the trigger."""
    window = _window(35.0)
    t0 = window.onset_monotonic_ms
    duration_trigger_ms = t0 + 30_000
    timeline = [
        _est(t0 + 10_000, cov=120.0, fix_type=2),
        _est(duration_trigger_ms, cov=200.0, fix_type=2),  # duration trigger fires here
        _est(t0 + 30_400, cov=200.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(t0 + 30_300, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    ac2 = result.failsafe
    assert ac2.failsafe_trigger_at_ms == duration_trigger_ms
    assert ac2.horiz_999_latency_ms == 400
    assert ac2.statustext_latency_ms == 300
    assert ac2.passes
def test_ac2_uses_earliest_of_cov500_and_duration_trigger() -> None:
    """The covariance trigger at +20 s beats the duration trigger at +30 s."""
    window = _window(35.0)
    t0 = window.onset_monotonic_ms
    cov_trigger_ms = t0 + 20_000
    timeline = [
        _est(cov_trigger_ms, cov=500.0, fix_type=2),  # covariance trigger
        _est(t0 + 20_300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(t0 + 20_400, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    assert result.failsafe.failsafe_trigger_at_ms == cov_trigger_ms
def test_ac2_horiz_999_late_fails() -> None:
    """A failsafe horizontal accuracy arriving 700 ms after the trigger fails AC-2."""
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 25_000, cov=500.0, fix_type=2),
        _est(t0 + 25_700, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    # STATUSTEXT itself is on time; only the 999 publication is late.
    texts = [_st(t0 + 25_300, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    ac2 = result.failsafe
    assert ac2.horiz_999_latency_ms == 700
    assert not ac2.horiz_999_passes
    assert not ac2.passes
def test_ac2_missing_statustext_fails() -> None:
    """Without any STATUSTEXT the failsafe AC fails, even with a timely 999."""
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 25_000, cov=500.0, fix_type=2),
        _est(t0 + 25_300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]

    result = ele.evaluate(window, estimates=timeline, statustexts=[])

    ac2 = result.failsafe
    assert ac2.statustext_at_ms is None
    assert not ac2.statustext_passes
    assert not ac2.passes
def test_ac2_no_trigger_fails() -> None:
    """A 20 s window with covariance below 500 m yields no failsafe trigger."""
    window = _window(20.0)
    timeline = [_est(window.onset_monotonic_ms + 5000, cov=120.0, fix_type=2)]

    result = ele.evaluate(window, estimates=timeline, statustexts=[])

    assert result.failsafe.failsafe_trigger_at_ms is None
    assert not result.failsafe.passes
# ───────────────────────── AC-ORDER ─────────────────────────
def test_ac_order_cov2d_before_failsafe_passes() -> None:
    """The 100 m crossing (+10 s) precedes the failsafe trigger (+25 s): ordering holds."""
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 10_000, cov=100.0, fix_type=3),
        _est(t0 + 10_300, cov=120.0, fix_type=2),
        _est(t0 + 25_000, cov=500.0, fix_type=2),
        _est(t0 + 25_300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(t0 + 25_400, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    assert result.ordering.passes
    assert result.passes
def test_ac_order_simultaneous_crossings_fails_ordering() -> None:
    """One sample crossing both 100 m and 500 m violates strict ordering.

    The contract treats this as a strict-monotonicity violation: the AC-1
    escalation on the 100 m crossing has to fire before the failsafe trigger,
    so equal timestamps are not accepted.
    """
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 25_000, cov=500.0, fix_type=2),  # crosses both thresholds at once
        _est(t0 + 25_300, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(t0 + 25_300, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    # Both poles share a timestamp; ordering demands strictly-less-than.
    order = result.ordering
    assert order.cov2d_at_ms == order.failsafe_trigger_at_ms
    assert not order.passes
    assert not result.passes
def test_ac_order_missing_pole_fails_ordering() -> None:
    """Ordering fails when the failsafe trigger never occurs (only the 100 m crossing does)."""
    window = _window(20.0)
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 5000, cov=100.0, fix_type=3),
        _est(t0 + 5300, cov=120.0, fix_type=2),
    ]

    result = ele.evaluate(window, estimates=timeline, statustexts=[])

    assert not result.ordering.passes
# ───────────────────────── aggregate ─────────────────────────
def test_aggregate_passes_when_all_ac_pass() -> None:
    """A full, clean escalation passes the aggregate verdict."""
    window = _window()
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 10_000, cov=100.0, fix_type=3),
        _est(t0 + 10_200, cov=120.0, fix_type=2),
        _est(t0 + 25_000, cov=500.0, fix_type=2),
        _est(t0 + 25_200, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(t0 + 25_300, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    assert result.passes
def test_aggregate_fails_window_out_of_spec() -> None:
    """A 25 s window is out of spec, so the aggregate fails despite a full escalation."""
    window = _window(25.0)
    t0 = window.onset_monotonic_ms
    timeline = [
        _est(t0 + 5_000, cov=100.0, fix_type=3),
        _est(t0 + 5_200, cov=120.0, fix_type=2),
        _est(t0 + 15_000, cov=500.0, fix_type=2),
        _est(t0 + 15_200, cov=520.0, horiz=ele.HORIZ_ACCURACY_FAILSAFE, fix_type=2),
    ]
    texts = [_st(t0 + 15_300, ele.STATUSTEXT_FAILSAFE)]

    result = ele.evaluate(window, estimates=timeline, statustexts=texts)

    assert not result.passes_window
    assert not result.passes
# ───────────────────────── csv emit ─────────────────────────
def test_write_csv_evidence_one_row(tmp_path: Path) -> None:
    """The NFT-RES-04 CSV has a header plus exactly one data row with per-AC columns."""
    # Arrange — an empty timeline is enough to produce a report to serialise.
    w = _window()
    r = ele.evaluate(w, estimates=[], statustexts=[])
    out_path = tmp_path / "nft-res-04.csv"
    # Act
    ele.write_csv_evidence(out_path, r)
    # Assert — decode explicitly so the check does not depend on the host locale.
    rows = out_path.read_text(encoding="utf-8").splitlines()
    assert len(rows) == 2
    header = rows[0]
    assert header.startswith("window_duration_s")
    assert "ac1_passes" in header
    assert "ac2_passes" in header
    assert "ac_order_passes" in header
@@ -0,0 +1,318 @@
"""Unit tests for ``runner.helpers.imu_fallback_drift_evaluator`` (AZ-432 / NFT-RES-01)."""
from __future__ import annotations
from pathlib import Path
import pytest
from runner.helpers import imu_fallback_drift_evaluator as ife
from runner.helpers.geo import distance_m
# Reference point in Derkachi (used so distances are realistic): 49.9°N, 36.3°E.
# The sample builder below keeps longitude at REF_LON and offsets latitude from REF_LAT.
REF_LAT = 49.9
REF_LON = 36.3
def _offset_lat(lat: float, north_m: float) -> float:
"""Approximate latitude offset by ``north_m`` meters (good enough for tests)."""
# ~111_320 m per degree of latitude at WGS84.
return lat + north_m / 111_320.0
def _window(duration_s: float, onset_ms: int = 60_000) -> ife.BlackoutWindow:
    """Build a blackout window of ``duration_s`` seconds starting at ``onset_ms``."""
    duration_ms = int(duration_s * 1000)
    end_ms = onset_ms + duration_ms
    return ife.BlackoutWindow(onset_monotonic_ms=onset_ms, end_monotonic_ms=end_ms)
def _samples(*pairs: tuple[int, float]) -> tuple[ife.PositionSample, ...]:
    """Turn ``(t_ms, north_m)`` pairs into position samples along the REF_LON meridian."""
    built = []
    for t_ms, north_m in pairs:
        latitude = _offset_lat(REF_LAT, north_m)
        built.append(
            ife.PositionSample(monotonic_ms=t_ms, lat_deg=latitude, lon_deg=REF_LON)
        )
    return tuple(built)
# ───────────────────────── BlackoutWindow ─────────────────────────
def test_window_in_spec_exact_30s_passes() -> None:
    """A window of exactly 30 s reports that duration and is in spec."""
    window = _window(30.0)

    assert window.duration_s == pytest.approx(30.0)
    assert window.window_in_spec
@pytest.mark.parametrize("duration", [28.5, 31.5])
def test_window_in_spec_within_tolerance(duration: float) -> None:
    """Durations 1.5 s either side of 30 s are still in spec."""
    window = _window(duration)
    assert window.window_in_spec
@pytest.mark.parametrize("duration", [27.0, 33.0])
def test_window_in_spec_outside_tolerance(duration: float) -> None:
    """Durations 3 s either side of 30 s are out of spec."""
    window = _window(duration)
    assert not window.window_in_spec
# ───────────────────────── evaluate_subcase ─────────────────────────
def test_evaluate_subcase_unknown_subcase_raises() -> None:
    """An unrecognised sub-case label is rejected with ``ValueError``."""
    window = _window(30.0)
    estimates = _samples((90_000, 60.0))
    truth = _samples((90_000, 0.0))

    with pytest.raises(ValueError):
        ife.evaluate_subcase(
            window,
            estimates=estimates,
            ground_truth=truth,
            subcase="some_typo",
        )
def test_evaluate_subcase_no_imu_within_budget_passes() -> None:
    """~80 m drift at blackout end is inside the 100 m no-IMU budget."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    drifted = _samples((end_ms, 80.0))
    truth = _samples((end_ms, 0.0))

    report = ife.evaluate_subcase(window, drifted, truth, subcase=ife.SUBCASE_NO_IMU)

    assert report.subcase == ife.SUBCASE_NO_IMU
    assert report.budget_m == ife.NO_IMU_BUDGET_M
    assert report.drift_m is not None
    assert report.drift_m == pytest.approx(80.0, abs=0.5)
    assert report.passes
def test_evaluate_subcase_no_imu_above_budget_fails() -> None:
    """~120 m drift exceeds the 100 m no-IMU budget."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    drifted = _samples((end_ms, 120.0))
    truth = _samples((end_ms, 0.0))

    report = ife.evaluate_subcase(window, drifted, truth, subcase=ife.SUBCASE_NO_IMU)

    assert report.drift_m is not None
    assert report.drift_m == pytest.approx(120.0, abs=0.5)
    assert not report.passes
def test_evaluate_subcase_good_imu_within_budget_passes() -> None:
    """~40 m drift is inside the 50 m good-IMU budget."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    drifted = _samples((end_ms, 40.0))
    truth = _samples((end_ms, 0.0))

    report = ife.evaluate_subcase(window, drifted, truth, subcase=ife.SUBCASE_GOOD_IMU)

    assert report.budget_m == ife.GOOD_IMU_BUDGET_M
    assert report.passes
def test_evaluate_subcase_good_imu_above_budget_fails() -> None:
    """~70 m drift exceeds the 50 m good-IMU budget."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    drifted = _samples((end_ms, 70.0))
    truth = _samples((end_ms, 0.0))

    report = ife.evaluate_subcase(window, drifted, truth, subcase=ife.SUBCASE_GOOD_IMU)

    assert not report.passes
def test_evaluate_subcase_no_estimate_at_end_returns_none_drift() -> None:
    """Drift is ``None`` only when no estimate exists at or before blackout end."""
    window = _window(30.0)
    truth = _samples((window.end_monotonic_ms, 0.0))

    # A sample 5 s before onset is still <= end_ms, so it counts as the
    # estimate at end; this is NOT the "no estimate" case.
    pre_onset = _samples((window.onset_monotonic_ms - 5_000, 0.0))
    with_stale = ife.evaluate_subcase(window, pre_onset, truth, subcase=ife.SUBCASE_NO_IMU)
    assert with_stale.estimate_at_end is not None

    # The genuine "no estimate" case: nothing at all at or before end_ms.
    no_estimates: tuple[ife.PositionSample, ...] = ()
    without_any = ife.evaluate_subcase(window, no_estimates, truth, subcase=ife.SUBCASE_NO_IMU)
    assert without_any.drift_m is None
    assert not without_any.passes
def test_evaluate_subcase_picks_latest_sample_at_or_before_end() -> None:
    """Of three estimates, the one stamped exactly at blackout end is selected."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    candidates = _samples(
        (end_ms - 1000, 30.0),
        (end_ms, 50.0),
        (end_ms + 1000, 200.0),  # after the window, so it must be ignored
    )
    truth = _samples((end_ms, 0.0))

    report = ife.evaluate_subcase(window, candidates, truth, subcase=ife.SUBCASE_GOOD_IMU)

    chosen = report.estimate_at_end
    assert chosen is not None
    assert report.estimate_at_end.monotonic_ms == end_ms
    assert report.drift_m == pytest.approx(50.0, abs=0.5)
def test_evaluate_subcase_explicit_budget_overrides_default() -> None:
    """~60 m drift would pass the default 100 m budget but fails an explicit 40 m one."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    drifted = _samples((end_ms, 60.0))
    truth = _samples((end_ms, 0.0))

    report = ife.evaluate_subcase(
        window,
        drifted,
        truth,
        subcase=ife.SUBCASE_NO_IMU,
        budget_m=40.0,
    )

    assert report.budget_m == 40.0
    assert not report.passes
# ───────────────────────── evaluate (aggregate) ─────────────────────────
def test_evaluate_both_subcases_pass() -> None:
    """Both sub-cases within budget and an in-spec window pass the aggregate."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    truth = _samples((end_ms, 0.0))
    cases = [
        (ife.SUBCASE_NO_IMU, _samples((end_ms, 80.0)), truth),
        (ife.SUBCASE_GOOD_IMU, _samples((end_ms, 40.0)), truth),
    ]

    report = ife.evaluate(window, sub_cases=cases)

    assert len(report.sub_cases) == 2
    assert report.passes_window
    assert report.passes
    assert report.by_subcase(ife.SUBCASE_NO_IMU).passes
    assert report.by_subcase(ife.SUBCASE_GOOD_IMU).passes
def test_evaluate_one_subcase_fails_fails_aggregate() -> None:
    """One failing sub-case is enough to fail the aggregate."""
    window = _window(30.0)
    end_ms = window.end_monotonic_ms
    truth = _samples((end_ms, 0.0))
    cases = [
        (ife.SUBCASE_NO_IMU, _samples((end_ms, 80.0)), truth),  # within 100 m
        (ife.SUBCASE_GOOD_IMU, _samples((end_ms, 70.0)), truth),  # beyond 50 m
    ]

    report = ife.evaluate(window, sub_cases=cases)

    assert not report.passes
def test_evaluate_window_out_of_spec_fails_even_if_drift_passes() -> None:
    """A 25 s window fails the aggregate even though the ~10 m drift is fine."""
    short_window = _window(25.0)
    end_ms = short_window.end_monotonic_ms
    case = (ife.SUBCASE_NO_IMU, _samples((end_ms, 10.0)), _samples((end_ms, 0.0)))

    report = ife.evaluate(short_window, sub_cases=[case])

    assert not report.passes_window
    assert not report.passes
def test_evaluate_by_subcase_unknown_raises() -> None:
    """Looking up a sub-case absent from the report raises ``KeyError``."""
    empty_report = ife.evaluate(_window(30.0), sub_cases=[])

    with pytest.raises(KeyError):
        empty_report.by_subcase(ife.SUBCASE_NO_IMU)
# ───────────────────────── distance sanity ─────────────────────────
def test_helper_offset_lat_aligns_with_vincenty_within_meter() -> None:
    """Sanity-check ``_offset_lat`` against the project's ``distance_m`` helper."""
    shifted_lat = _offset_lat(REF_LAT, 100.0)

    measured_m = distance_m(REF_LAT, REF_LON, shifted_lat, REF_LON)

    # The fixed-scale approximation must land within 1 m for a 100 m offset.
    error_m = abs(measured_m - 100.0)
    assert error_m < 1.0
# ───────────────────────── csv emit ─────────────────────────
def test_write_csv_evidence_one_row_per_subcase(tmp_path: Path) -> None:
    """The NFT-RES-01 CSV has a header plus one row for each evaluated sub-case."""
    # Arrange
    w = _window(30.0)
    end_ms = w.end_monotonic_ms
    report = ife.evaluate(
        w,
        sub_cases=[
            (ife.SUBCASE_NO_IMU, _samples((end_ms, 80.0)), _samples((end_ms, 0.0))),
            (ife.SUBCASE_GOOD_IMU, _samples((end_ms, 40.0)), _samples((end_ms, 0.0))),
        ],
    )
    out_path = tmp_path / "nft-res-01.csv"
    # Act
    ife.write_csv_evidence(out_path, report)
    # Assert — decode explicitly so the check does not depend on the host locale.
    rows = out_path.read_text(encoding="utf-8").splitlines()
    assert rows[0].startswith("subcase,")
    assert "drift_m" in rows[0]
    assert len(rows) == 3  # header + 2 sub-cases
    # Row order is not pinned; each label just has to appear in one data row.
    assert ife.SUBCASE_NO_IMU in rows[1] or ife.SUBCASE_NO_IMU in rows[2]
    assert ife.SUBCASE_GOOD_IMU in rows[1] or ife.SUBCASE_GOOD_IMU in rows[2]
def test_write_csv_evidence_creates_parent_dir(tmp_path: Path) -> None:
    """Writing into a not-yet-existing nested directory still produces the file."""
    empty_report = ife.evaluate(_window(30.0), sub_cases=[])
    target = tmp_path / "nested" / "deep" / "nft-res-01.csv"

    ife.write_csv_evidence(target, empty_report)

    assert target.exists()
@@ -0,0 +1,227 @@
"""Unit tests for ``runner.helpers.monte_carlo_envelope_evaluator`` (AZ-434 / NFT-RES-03)."""
from __future__ import annotations
from pathlib import Path
import pytest
from runner.helpers import monte_carlo_envelope_evaluator as mce
def _iter(iter_id: str, seed: int, samples: list[tuple[float, float]]) -> mce.IterationOutcome:
    """Wrap ``(error_m, cov_semi_major_m)`` pairs in an ``IterationOutcome``."""
    frames = tuple(
        mce.FrameSample(error_m=error_m, cov_semi_major_m=cov_m)
        for error_m, cov_m in samples
    )
    return mce.IterationOutcome(iteration_id=iter_id, iteration_seed=seed, samples=frames)
def _uniform_iter(iter_id: str, seed: int, *, n: int, error_m: float, cov_m: float) -> mce.IterationOutcome:
    """Build an iteration of ``n`` identical ``(error_m, cov_m)`` frames."""
    repeated = [(error_m, cov_m)] * n
    return _iter(iter_id, seed, repeated)
# ───────────────────────── AC-1 iteration count ─────────────────────────
def test_ac1_at_min_iterations_passes() -> None:
    """Exactly 100 single-frame, fully covered iterations satisfy AC-1 and the aggregate."""
    runs = [
        _uniform_iter(f"it{idx:03d}", idx, n=1, error_m=0.5, cov_m=1.0)
        for idx in range(100)
    ]

    report = mce.evaluate(runs, master_seed=42)

    assert report.iteration_count == 100
    assert report.passes_iteration_count
    assert report.passes
def test_ac1_below_min_iterations_fails() -> None:
    """99 iterations is one short of the minimum, so AC-1 and the aggregate fail."""
    runs = [
        _uniform_iter(f"it{idx:03d}", idx, n=1, error_m=0.5, cov_m=1.0)
        for idx in range(99)
    ]

    report = mce.evaluate(runs, master_seed=42)

    assert report.iteration_count == 99
    assert not report.passes_iteration_count
    assert not report.passes
def test_ac1_custom_min_iteration_count() -> None:
    """Five iterations pass AC-1 when the minimum is lowered to five."""
    runs = [
        _uniform_iter(f"it{idx}", idx, n=10, error_m=0.5, cov_m=1.0)
        for idx in range(5)
    ]

    report = mce.evaluate(runs, master_seed=1, min_iteration_count=5)

    assert report.passes_iteration_count
# ───────────────────────── AC-3 envelope ratio ─────────────────────────
def test_ac3_all_covered_passes() -> None:
    """Error at half the covariance is always inside the 1.96x envelope."""
    runs = [
        _uniform_iter(f"it{idx}", idx, n=10, error_m=0.5, cov_m=1.0)
        for idx in range(100)
    ]

    report = mce.evaluate(runs, master_seed=0)

    # 100 iterations x 10 frames, all covered.
    assert report.total_samples == 1000
    assert report.covered_samples == 1000
    assert report.envelope_ratio == pytest.approx(1.0)
    assert report.passes_envelope
def test_ac3_at_budget_passes() -> None:
    """95 covered and 5 uncovered frames give a ratio of exactly 0.95, which passes."""
    inside = [
        _uniform_iter(f"c{idx}", idx, n=1, error_m=0.5, cov_m=1.0)
        for idx in range(95)
    ]
    # error 3.0 exceeds 1.96 * 1.0, so these frames fall outside the envelope.
    outside = [
        _uniform_iter(f"u{idx}", 1000 + idx, n=1, error_m=3.0, cov_m=1.0)
        for idx in range(5)
    ]

    report = mce.evaluate([*inside, *outside], master_seed=0)

    assert report.envelope_ratio == pytest.approx(0.95)
    assert report.passes_envelope
def test_ac3_below_budget_fails() -> None:
    """90 covered and 10 uncovered frames give a ratio of 0.90, which fails."""
    inside = [
        _uniform_iter(f"c{idx}", idx, n=1, error_m=0.5, cov_m=1.0)
        for idx in range(90)
    ]
    outside = [
        _uniform_iter(f"u{idx}", 1000 + idx, n=1, error_m=3.0, cov_m=1.0)
        for idx in range(10)
    ]

    report = mce.evaluate([*inside, *outside], master_seed=0)

    assert report.envelope_ratio == pytest.approx(0.90)
    assert not report.passes_envelope
def test_ac3_edge_error_exactly_at_envelope_counts_as_covered() -> None:
    """An error equal to 1.96 x covariance sits on the boundary and counts as covered."""
    runs = [
        _uniform_iter(f"it{idx}", idx, n=1, error_m=1.96, cov_m=1.0)
        for idx in range(100)
    ]

    report = mce.evaluate(runs, master_seed=0)

    assert report.envelope_ratio == pytest.approx(1.0)
def test_ac3_empty_iterations_returns_none_ratio() -> None:
    """With no iterations there are no samples, no ratio, and nothing passes."""
    no_runs: list = []
    report = mce.evaluate(no_runs, master_seed=0)

    assert report.total_samples == 0
    assert report.envelope_ratio is None
    assert not report.passes_envelope
    assert not report.passes
def test_ac3_custom_envelope_multiplier() -> None:
    """Tightening the multiplier to 0.5 makes error == covariance fall outside."""
    # error 1.0 against cov 1.0 would be covered at 1.96x, but not at 0.5x.
    runs = [
        _uniform_iter(f"it{idx}", idx, n=1, error_m=1.0, cov_m=1.0)
        for idx in range(100)
    ]

    report = mce.evaluate(runs, master_seed=0, envelope_multiplier=0.5)

    assert report.envelope_ratio == pytest.approx(0.0)
    assert not report.passes_envelope
# ───────────────────────── AC-2 determinism ─────────────────────────
def test_iteration_hash_same_for_same_inputs() -> None:
    """Two iterations built from identical inputs hash identically."""
    frames = [(0.5, 1.0), (0.6, 1.0), (0.4, 0.9)]
    first = _iter("it1", 7, list(frames))
    second = _iter("it1", 7, list(frames))

    assert mce.iteration_hash(first) == mce.iteration_hash(second)
def test_iteration_hash_differs_when_samples_differ() -> None:
    """Changing a frame's error changes the iteration hash."""
    baseline = _iter("it1", 7, [(0.5, 1.0)])
    altered = _iter("it1", 7, [(0.6, 1.0)])

    assert mce.iteration_hash(baseline) != mce.iteration_hash(altered)
def test_iteration_hash_differs_when_seed_differs() -> None:
    """Changing only the iteration seed changes the iteration hash."""
    baseline = _iter("it1", 7, [(0.5, 1.0)])
    reseeded = _iter("it1", 8, [(0.5, 1.0)])

    assert mce.iteration_hash(baseline) != mce.iteration_hash(reseeded)
def test_determinism_fingerprint_stable_across_two_evaluate_calls() -> None:
    """Evaluating the same iterations twice with one master seed gives one fingerprint."""
    runs = [
        _uniform_iter(f"it{idx:03d}", idx, n=3, error_m=0.5, cov_m=1.0)
        for idx in range(10)
    ]
    first = mce.evaluate(runs, master_seed=42)
    second = mce.evaluate(runs, master_seed=42)

    assert mce.determinism_fingerprint(first) == mce.determinism_fingerprint(second)
def test_determinism_fingerprint_differs_when_master_seed_changes() -> None:
    """The same iterations under a different master seed give a different fingerprint."""
    runs = [
        _uniform_iter(f"it{idx:03d}", idx, n=3, error_m=0.5, cov_m=1.0)
        for idx in range(10)
    ]
    seeded_42 = mce.evaluate(runs, master_seed=42)
    seeded_43 = mce.evaluate(runs, master_seed=43)

    assert mce.determinism_fingerprint(seeded_42) != mce.determinism_fingerprint(seeded_43)
# ───────────────────────── csv emit ─────────────────────────
def test_write_csv_evidence_emits_summary(tmp_path: Path) -> None:
    """The NFT-RES-03 summary CSV has a header plus one row with the AC and fingerprint columns."""
    # Arrange
    iterations = [_uniform_iter(f"it{i:03d}", i, n=10, error_m=0.5, cov_m=1.0) for i in range(100)]
    report = mce.evaluate(iterations, master_seed=42)
    out_path = tmp_path / "nft-res-03.csv"
    # Act
    mce.write_csv_evidence(out_path, report)
    # Assert — decode explicitly so the check does not depend on the host locale.
    rows = out_path.read_text(encoding="utf-8").splitlines()
    assert len(rows) == 2
    header = rows[0]
    assert header.startswith("master_seed")
    assert "ac1_iteration_count_passes" in header
    assert "ac3_envelope_passes" in header
    assert "fingerprint_sha256" in header
def test_write_per_iteration_csv_row_per_iter(tmp_path: Path) -> None:
    """The per-iteration CSV has the exact expected header and one row per iteration."""
    # Arrange
    iterations = [_uniform_iter(f"it{i}", i, n=2, error_m=0.5, cov_m=1.0) for i in range(5)]
    report = mce.evaluate(iterations, master_seed=1, min_iteration_count=5)
    out_path = tmp_path / "per-iter.csv"
    # Act
    mce.write_per_iteration_csv(out_path, report)
    # Assert — decode explicitly so the check does not depend on the host locale.
    rows = out_path.read_text(encoding="utf-8").splitlines()
    assert rows[0] == (
        "iteration_id,iteration_seed,frame_count,covered_count,envelope_ratio,iteration_hash_sha256"
    )
    assert len(rows) == 6  # header + 5 iterations