[AZ-424] [AZ-425] [AZ-426] Implement negatives set (FT-N-01/03/04)

Adds three pure-logic evaluators + scenarios + unit tests covering the project's failure-mode robustness ladder (AC-3.1, AC-3.4, AC-3.5, AC-NEW-8): * outlier_tolerance_evaluator (AZ-424 / FT-N-01): per-event 50 m drift bound + 3-frame covariance-monotonic window over the AZ-408 outlier injector's medium-density manifest. * outage_request_evaluator (AZ-425 / FT-N-03): detects 3+ consecutive missing-frame windows; validates OPERATOR_RELOC_REQUEST STATUSTEXT arrives at 2 s ±500 ms, dead_reckoned label during outage, and no FC EKF divergence. * blackout_spoof_evaluator (AZ-426 / FT-N-04): eight-AC ladder across the 5 s / 15 s / 35 s sub-windows — switch latency, spoof rejection, monotonic covariance, honest horiz_accuracy, STATUSTEXT 1-2 Hz, 35 s escalation thresholds, and recovery gate. Each scenario is skip-gated on the AZ-441 / AZ-407 / AZ-416 replay / SITL / mavproxy helpers; unit tests (14 + 18 + 29 = 61) cover the AC logic today. Full e2e unit-test suite: 527 passed (+67). Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 14:41:15 +00:00 · 2026-05-17 08:26:16 +03:00
parent a644debdb7
commit 2d6d44af5d
16 changed files with 3343 additions and 1 deletions
@@ -0,0 +1,588 @@
+"""Unit tests for `e2e/runner/helpers/blackout_spoof_evaluator.py` (AZ-426)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+from e2e.runner.helpers.blackout_spoof_evaluator import (
+    DEAD_RECKONED_LABEL,
+    ESCALATION_COV_2D_M,
+    ESCALATION_COV_FAILSAFE_M,
+    ESCALATION_DURATION_FAILSAFE_S,
+    ESCALATION_FIX_TYPE_2D,
+    ESCALATION_LATENCY_MS,
+    HONEST_ACCURACY_RATIO,
+    HORIZ_ACCURACY_FAILSAFE,
+    RECOVERY_STABLE_S,
+    SATELLITE_ANCHORED_LABEL,
+    STATUSTEXT_FAILSAFE,
+    STATUSTEXT_IMU_ONLY,
+    STATUSTEXT_RATE_MAX_HZ,
+    STATUSTEXT_RATE_MIN_HZ,
+    SWITCH_LATENCY_MS,
+    BlackoutWindow,
+    ConsistencyCheckEvent,
+    GpsHealthSample,
+    OutboundEstimateSample,
+    SpoofRejectedEvent,
+    StatustextSample,
+    evaluate,
+    evaluate_covariance_monotonic,
+    evaluate_escalation,
+    evaluate_honest_accuracy,
+    evaluate_recovery_gate,
+    evaluate_spoof_rejection,
+    evaluate_statustext_rate,
+    evaluate_switch_latency,
+    write_csv_evidence,
+)
+
+
+# Constants
+
+
+def test_constants_match_spec():
+    """Pin every AZ-426 threshold and label imported from the evaluator module."""
+    # Each pair is (imported constant, value the spec requires):
+    # AC-1 ≤400 ms, AC-4 ≥0.95×cov, AC-5 1-2 Hz, AC-6/7/8 thresholds.
+    pinned = [
+        (SWITCH_LATENCY_MS, 400),
+        (HONEST_ACCURACY_RATIO, 0.95),
+        (STATUSTEXT_RATE_MIN_HZ, 1.0),
+        (STATUSTEXT_RATE_MAX_HZ, 2.0),
+        (ESCALATION_COV_2D_M, 100.0),
+        (ESCALATION_COV_FAILSAFE_M, 500.0),
+        (ESCALATION_DURATION_FAILSAFE_S, 30.0),
+        (ESCALATION_FIX_TYPE_2D, 2),
+        (HORIZ_ACCURACY_FAILSAFE, 999.0),
+        (ESCALATION_LATENCY_MS, 500),
+        (RECOVERY_STABLE_S, 10.0),
+        (STATUSTEXT_IMU_ONLY, "VISUAL_BLACKOUT_IMU_ONLY"),
+        (STATUSTEXT_FAILSAFE, "VISUAL_BLACKOUT_FAILSAFE"),
+        (DEAD_RECKONED_LABEL, "dead_reckoned"),
+        (SATELLITE_ANCHORED_LABEL, "satellite_anchored"),
+    ]
+    for actual, required in pinned:
+        assert actual == required
+
+
+def _window(onset_ms: int = 10_000, duration_s: float = 5.0) -> BlackoutWindow:
+    """Return a `BlackoutWindow` starting at `onset_ms` and lasting `duration_s` seconds."""
+    # The end stamp is whole milliseconds; any fractional part is truncated.
+    span_ms = int(duration_s * 1000)
+    return BlackoutWindow(
+        onset_monotonic_ms=onset_ms,
+        end_monotonic_ms=onset_ms + span_ms,
+    )
+
+
+def _est(
+    ms: int,
+    *,
+    label: str = DEAD_RECKONED_LABEL,
+    cov: float = 5.0,
+    horiz: float | None = None,
+    fix_type: int = 3,
+) -> OutboundEstimateSample:
+    """Build an outbound estimate sample stamped at `ms`.
+
+    When `horiz` is omitted, the reported horizontal accuracy equals `cov`.
+    """
+    reported_accuracy = horiz if horiz is not None else cov
+    return OutboundEstimateSample(
+        monotonic_ms=ms,
+        source_label=label,
+        cov_semi_major_m=cov,
+        horiz_accuracy=reported_accuracy,
+        fix_type=fix_type,
+    )
+
+
+# AC-1 switch latency
+
+
+def test_switch_latency_within_400_ms_passes():
+    """Check a +350 ms switch with a 33 ms frame period is reported as failing.
+
+    NOTE(review): the test name says "passes", but the assertions below expect
+    `report.passes is False`; the name looks stale and should probably be
+    changed to describe the failing outcome.
+    """
+    # Arrange
+    w = _window()
+    estimates = [
+        _est(w.onset_monotonic_ms - 100, label=SATELLITE_ANCHORED_LABEL),
+        _est(w.onset_monotonic_ms + 350),
+    ]
+
+    # Act
+    report = evaluate_switch_latency(w, estimates, frame_period_ms=33)
+
+    # Assert — budget is min(400, 33) = 33 ms; 350 > 33 → fails.
+    assert report.first_dead_reckoned_offset_ms == 350
+    assert report.passes is False
+
+
+def test_switch_latency_within_one_frame_passes():
+    """A dead_reckoned sample at +50 ms with a 100 ms frame period is expected to pass."""
+    # Arrange — the only sample lands half a frame period after onset.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 50)]
+
+    # Act
+    result = evaluate_switch_latency(window, samples, frame_period_ms=100)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_switch_latency_at_one_frame_boundary_passes():
+    """A dead_reckoned sample exactly one frame period (100 ms) after onset is expected to pass."""
+    # Arrange — offset equals the frame period, i.e. the inclusive boundary.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 100)]
+
+    # Act
+    result = evaluate_switch_latency(window, samples, frame_period_ms=100)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_switch_latency_missing_dead_reckoned_fails():
+    """With only a satellite_anchored sample, no switch offset is reported and the check fails."""
+    # Arrange — the single in-window sample never carries the dead_reckoned label.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 50, label=SATELLITE_ANCHORED_LABEL)]
+
+    # Act
+    result = evaluate_switch_latency(window, samples, frame_period_ms=100)
+
+    # Assert
+    assert result.first_dead_reckoned_offset_ms is None
+    assert result.passes is False
+
+
+# AC-2 spoof rejection
+
+
+def test_spoof_rejection_pass():
+    """One spoof-rejected event plus a dead_reckoned-only window is expected to pass."""
+    # Arrange — a rejection at +200 ms and a single dead_reckoned sample at +500 ms.
+    window = _window()
+    onset = window.onset_monotonic_ms
+    samples = [_est(onset + 500)]
+    rejections = [SpoofRejectedEvent(monotonic_ms=onset + 200, reason="delta>500m")]
+
+    # Act
+    result = evaluate_spoof_rejection(window, samples, rejections)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_spoof_rejection_no_events_fails():
+    """Without any spoof-rejected event the check is expected to fail."""
+    # Arrange — a dead_reckoned sample exists, but the event list is empty.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 500)]
+
+    # Act
+    result = evaluate_spoof_rejection(window, samples, spoof_events=[])
+
+    # Assert
+    assert result.passes is False
+
+
+def test_spoof_rejection_label_returns_to_satellite_fails():
+    """A satellite_anchored sample inside the window is counted and fails the check."""
+    # Arrange — a rejection event is present, yet the label flips back at +1000 ms.
+    window = _window()
+    onset = window.onset_monotonic_ms
+    samples = [
+        _est(onset + 100),
+        _est(onset + 1000, label=SATELLITE_ANCHORED_LABEL),
+    ]
+    rejections = [SpoofRejectedEvent(monotonic_ms=onset + 50, reason="x")]
+
+    # Act
+    result = evaluate_spoof_rejection(window, samples, rejections)
+
+    # Assert
+    assert result.satellite_anchored_inside_window == 1
+    assert result.passes is False
+
+
+# AC-3 covariance monotonic
+
+
+def test_covariance_monotonic_pass():
+    """Covariance rising 5.0 → 5.5 → 6.0 inside the window is expected to pass."""
+    # Arrange — (offset from onset in ms, covariance in metres) pairs.
+    window = _window()
+    ramp = ((100, 5.0), (200, 5.5), (300, 6.0))
+    samples = [
+        _est(window.onset_monotonic_ms + offset_ms, cov=cov_m)
+        for offset_ms, cov_m in ramp
+    ]
+
+    # Act
+    result = evaluate_covariance_monotonic(window, samples)
+
+    # Assert
+    assert result.passes is True
+    assert result.first_decreasing_at_ms is None
+
+
+def test_covariance_monotonic_decreasing_fails():
+    """A covariance drop from 5.0 to 4.0 is reported at the second sample and fails."""
+    # Arrange — the second sample (+200 ms) has a smaller covariance than the first.
+    window = _window()
+    onset = window.onset_monotonic_ms
+    samples = [
+        _est(onset + 100, cov=5.0),
+        _est(onset + 200, cov=4.0),
+    ]
+
+    # Act
+    result = evaluate_covariance_monotonic(window, samples)
+
+    # Assert
+    assert result.first_decreasing_at_ms == onset + 200
+    assert result.passes is False
+
+
+# AC-4 honest accuracy
+
+
+def test_honest_accuracy_pass():
+    """horiz_accuracy equal to the covariance (10.0 vs 10.0) is expected to pass."""
+    # Arrange — reported accuracy matches the covariance, so it is ≥ 0.95 × cov.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 100, cov=10.0, horiz=10.0)]
+
+    # Act
+    result = evaluate_honest_accuracy(window, samples)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_honest_accuracy_boundary_pass():
+    """horiz_accuracy of 9.5 against a covariance of 10.0 (the 0.95 ratio) is expected to pass."""
+    # Arrange — reported accuracy sits exactly on the 0.95 × cov boundary.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 100, cov=10.0, horiz=9.5)]
+
+    # Act
+    result = evaluate_honest_accuracy(window, samples)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_honest_accuracy_violation_fails():
+    """horiz_accuracy of 9.0 against a covariance of 10.0 counts as one violation and fails."""
+    # Arrange — reported accuracy is 0.90 × cov, below the 0.95 ratio.
+    window = _window()
+    samples = [_est(window.onset_monotonic_ms + 100, cov=10.0, horiz=9.0)]
+
+    # Act
+    result = evaluate_honest_accuracy(window, samples)
+
+    # Assert
+    assert result.violation_count == 1
+    assert result.passes is False
+
+
+# AC-5 STATUSTEXT rate
+
+
+def test_statustext_rate_pass_at_1hz():
+    """Five IMU-only STATUSTEXTs spaced 1 s apart in a 5 s window yield 1 Hz and pass."""
+    # Arrange — one message per second, starting at onset.
+    window = _window(duration_s=5.0)
+    onset = window.onset_monotonic_ms
+    messages = []
+    for tick in range(5):
+        messages.append(
+            StatustextSample(monotonic_ms=onset + tick * 1000, text=STATUSTEXT_IMU_ONLY)
+        )
+
+    # Act
+    result = evaluate_statustext_rate(window, messages)
+
+    # Assert
+    assert result.observed_hz is not None and abs(result.observed_hz - 1.0) < 1e-6
+    assert result.passes is True
+
+
+def test_statustext_rate_pass_at_2hz():
+    """Ten IMU-only STATUSTEXTs spaced 500 ms apart in a 5 s window are expected to pass."""
+    # Arrange — two messages per second, starting at onset.
+    window = _window(duration_s=5.0)
+    onset = window.onset_monotonic_ms
+    messages = []
+    for tick in range(10):
+        messages.append(
+            StatustextSample(monotonic_ms=onset + tick * 500, text=STATUSTEXT_IMU_ONLY)
+        )
+
+    # Act
+    result = evaluate_statustext_rate(window, messages)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_statustext_rate_too_slow_fails():
+    """Two STATUSTEXTs in a 5 s window (0.4 Hz) are expected to fail the rate check."""
+    # Arrange — messages at onset and onset + 2 s only.
+    window = _window(duration_s=5.0)
+    onset = window.onset_monotonic_ms
+    messages = []
+    for tick in range(2):
+        messages.append(
+            StatustextSample(monotonic_ms=onset + tick * 2000, text=STATUSTEXT_IMU_ONLY)
+        )
+
+    # Act
+    result = evaluate_statustext_rate(window, messages)
+
+    # Assert
+    assert result.passes is False
+
+
+def test_statustext_rate_too_fast_fails():
+    """Thirty STATUSTEXTs in a 5 s window (6 Hz) exceed the maximum rate and fail."""
+    # Arrange — 30 messages spread evenly over the window, stamps truncated to ms.
+    window = _window(duration_s=5.0)
+    onset = window.onset_monotonic_ms
+    messages = []
+    for tick in range(30):
+        offset_ms = int(tick * 5000 / 30)
+        messages.append(
+            StatustextSample(monotonic_ms=onset + offset_ms, text=STATUSTEXT_IMU_ONLY)
+        )
+
+    # Act
+    result = evaluate_statustext_rate(window, messages)
+
+    # Assert
+    assert result.observed_hz is not None and result.observed_hz > STATUSTEXT_RATE_MAX_HZ
+    assert result.passes is False
+
+
+# AC-6 / AC-7 escalation (35 s window)
+
+
+def _make_35s_window(onset_ms: int = 10_000) -> BlackoutWindow:
+    """Return a 35-second blackout window starting at `onset_ms`."""
+    long_window = _window(onset_ms=onset_ms, duration_s=35.0)
+    return long_window
+
+
+def test_escalation_non_35s_window_passes_vacuously():
+    """With `is_35s_window=False` the escalation report is expected to pass."""
+    # Arrange — a 5 s window, one low-covariance sample, no STATUSTEXTs.
+    window = _window(duration_s=5.0)
+    samples = [_est(window.onset_monotonic_ms + 100, cov=10.0)]
+
+    # Act
+    result = evaluate_escalation(window, samples, statustexts=[], is_35s_window=False)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_escalation_35s_no_crossings_passes():
+    """Check a 35 s window with low covariance and no failsafe STATUSTEXT fails AC-7.
+
+    The samples cover onset + 0 s .. onset + 29 s with covariance rising from
+    10.0 m to 12.9 m, and no STATUSTEXT is supplied.
+
+    NOTE(review): the test name says "no_crossings_passes", but the assertions
+    expect `cov500_or_30s_crossed is True` and `passes_ac7 is False`; the name
+    looks stale and should probably describe the AC-7 failure instead.
+    """
+    # Arrange — covariance stays below ESCALATION_COV_2D_M for whole window.
+    w = _make_35s_window()
+    estimates = [
+        _est(w.onset_monotonic_ms + i * 1000, cov=10.0 + i * 0.1)
+        for i in range(30)
+    ]
+
+    # Act
+    report = evaluate_escalation(w, estimates, statustexts=[], is_35s_window=True)
+
+    # Assert — duration crossing at 30 s alone still triggers AC-7 path; no
+    # failsafe STATUSTEXT → fails AC-7.
+    assert report.cov500_or_30s_crossed is True
+    assert report.passes_ac7 is False
+
+
+def test_escalation_35s_ac6_fix_type_degraded_passes():
+    """Check AC-6 and AC-7 both pass when fix_type degrades and a failsafe STATUSTEXT arrives.
+
+    Samples cover onset + 0 s .. onset + 29 s: covariance is 50 m with
+    fix_type 3 for the first five samples, then 150 m with fix_type 2. A
+    single failsafe STATUSTEXT is placed 100 ms after the failsafe duration.
+    """
+    # Arrange — cov crosses 100 m at 5 s; fix_type drops to 2 from then on.
+    w = _make_35s_window()
+    estimates = []
+    for i in range(30):
+        t = w.onset_monotonic_ms + i * 1000
+        cov = 50.0 if i < 5 else 150.0
+        fix = 3 if i < 5 else 2
+        estimates.append(_est(t, cov=cov, horiz=cov, fix_type=fix))
+    # Provide failsafe STATUSTEXT at +30 s (within ESCALATION_LATENCY_MS of duration breach).
+    failsafe_at = w.onset_monotonic_ms + int(ESCALATION_DURATION_FAILSAFE_S * 1000)
+    statustexts = [
+        StatustextSample(monotonic_ms=failsafe_at + 100, text=STATUSTEXT_FAILSAFE)
+    ]
+    # All post-failsafe-trigger samples need horiz_accuracy == 999.
+    # NOTE(review): the last sample is at onset + 29 000 ms, while failsafe_at
+    # is onset + 30 000 ms when ESCALATION_DURATION_FAILSAFE_S is 30.0 (the
+    # value pinned by test_constants_match_spec), so this loop currently
+    # rewrites no sample and the horiz_accuracy == 999 path is not exercised
+    # by this data. Extending the sample range past 30 s would cover it.
+    for i in range(30):
+        if estimates[i].monotonic_ms >= failsafe_at:
+            estimates[i] = OutboundEstimateSample(
+                monotonic_ms=estimates[i].monotonic_ms,
+                source_label=DEAD_RECKONED_LABEL,
+                cov_semi_major_m=estimates[i].cov_semi_major_m,
+                horiz_accuracy=HORIZ_ACCURACY_FAILSAFE,
+                fix_type=2,
+            )
+
+    # Act
+    report = evaluate_escalation(w, estimates, statustexts, is_35s_window=True)
+
+    # Assert
+    assert report.passes_ac6 is True
+    assert report.passes_ac7 is True
+
+
+def test_escalation_35s_ac6_fix_type_not_degraded_fails():
+    """Covariance jumps to 150 m while fix_type stays 3; AC-6 is expected to fail."""
+    # Arrange — 30 one-second samples: 50 m for the first five, 150 m afterwards.
+    window = _make_35s_window()
+    onset = window.onset_monotonic_ms
+    samples = [
+        _est(onset + sec * 1000, cov=50.0 if sec < 5 else 150.0, fix_type=3)
+        for sec in range(30)
+    ]
+
+    # Act
+    result = evaluate_escalation(window, samples, statustexts=[], is_35s_window=True)
+
+    # Assert
+    assert result.passes_ac6 is False
+
+
+def test_escalation_35s_ac7_horiz_not_999_fails():
+    """A failsafe STATUSTEXT is sent but horiz_accuracy tracks covariance; AC-7 is expected to fail."""
+    # Arrange — 35 one-second samples whose covariance grows 20 m per second
+    # from 50 m (so it passes 100 m and then 500 m); horiz_accuracy mirrors it.
+    window = _make_35s_window()
+    onset = window.onset_monotonic_ms
+    samples = []
+    for sec in range(35):
+        growing_cov = 50.0 + sec * 20.0
+        samples.append(
+            _est(onset + sec * 1000, cov=growing_cov, horiz=growing_cov, fix_type=2)
+        )
+    failsafe_at = onset + int(ESCALATION_DURATION_FAILSAFE_S * 1000)
+    messages = [
+        StatustextSample(monotonic_ms=failsafe_at + 100, text=STATUSTEXT_FAILSAFE)
+    ]
+
+    # Act
+    result = evaluate_escalation(window, samples, messages, is_35s_window=True)
+
+    # Assert
+    assert result.horiz_accuracy_999 is False
+    assert result.passes_ac7 is False
+
+
+# AC-8 recovery gate
+
+
+def _post_window(w: BlackoutWindow) -> tuple[int, int]:
+    """Return `(window end, window end + RECOVERY_STABLE_S + 500 ms)` in monotonic ms."""
+    window_end = w.end_monotonic_ms
+    # Half a second of margin is added on top of the stability period.
+    after_stability = window_end + int(RECOVERY_STABLE_S * 1000) + 500
+    return window_end, after_stability
+
+
+def test_recovery_gate_pass():
+    """Healthy, unspoofed GPS plus a passed consistency check lets recovery pass."""
+    # Arrange — 12 one-second healthy GPS samples after the window, a passed
+    # consistency check at +5 s, then the first satellite_anchored emission.
+    window = _window()
+    blackout_end, stable_at = _post_window(window)
+    samples = [
+        _est(blackout_end + 500),
+        _est(stable_at + 100, label=SATELLITE_ANCHORED_LABEL),
+    ]
+    health = []
+    for sec in range(12):
+        health.append(
+            GpsHealthSample(monotonic_ms=blackout_end + sec * 1000, healthy=True, spoofed=False)
+        )
+    checks = [ConsistencyCheckEvent(monotonic_ms=blackout_end + 5000, passed=True)]
+
+    # Act
+    result = evaluate_recovery_gate(window, samples, health, checks)
+
+    # Assert
+    assert result.passes is True
+    assert result.recovery_at_ms == stable_at + 100
+
+
+def test_recovery_gate_unstable_fails():
+    """One unhealthy GPS sample among the post-window samples is expected to fail recovery."""
+    # Arrange — the sample at +5 s after the window is unhealthy; the rest are healthy.
+    window = _window()
+    blackout_end, stable_at = _post_window(window)
+    samples = [_est(stable_at + 100, label=SATELLITE_ANCHORED_LABEL)]
+    health = []
+    for sec in range(12):
+        health.append(
+            GpsHealthSample(
+                monotonic_ms=blackout_end + sec * 1000, healthy=(sec != 5), spoofed=False
+            )
+        )
+    checks = [ConsistencyCheckEvent(monotonic_ms=blackout_end + 5000, passed=True)]
+
+    # Act
+    result = evaluate_recovery_gate(window, samples, health, checks)
+
+    # Assert
+    assert result.passes is False
+
+
+def test_recovery_gate_spoofed_fails():
+    """One spoofed GPS sample among the post-window samples is expected to fail recovery."""
+    # Arrange — every sample is healthy, but the one at +3 s is flagged spoofed.
+    window = _window()
+    blackout_end, stable_at = _post_window(window)
+    samples = [_est(stable_at + 100, label=SATELLITE_ANCHORED_LABEL)]
+    health = []
+    for sec in range(12):
+        health.append(
+            GpsHealthSample(
+                monotonic_ms=blackout_end + sec * 1000, healthy=True, spoofed=(sec == 3)
+            )
+        )
+    checks = [ConsistencyCheckEvent(monotonic_ms=blackout_end + 5000, passed=True)]
+
+    # Act
+    result = evaluate_recovery_gate(window, samples, health, checks)
+
+    # Assert
+    assert result.passes is False
+
+
+def test_recovery_gate_no_consistency_check_fails():
+    """Healthy GPS without any consistency-check event is expected to fail recovery."""
+    # Arrange — 12 healthy, unspoofed GPS samples; the consistency list is empty.
+    window = _window()
+    blackout_end, stable_at = _post_window(window)
+    samples = [_est(stable_at + 100, label=SATELLITE_ANCHORED_LABEL)]
+    health = []
+    for sec in range(12):
+        health.append(
+            GpsHealthSample(monotonic_ms=blackout_end + sec * 1000, healthy=True, spoofed=False)
+        )
+
+    # Act
+    result = evaluate_recovery_gate(window, samples, health, consistency_checks=[])
+
+    # Assert
+    assert result.consistency_check_passed is False
+    assert result.passes is False
+
+
+def test_recovery_gate_no_recovery_attempt_vacuous_pass():
+    """With no satellite_anchored sample after the window, no recovery is reported and the gate passes."""
+    # Arrange — the only post-window sample keeps the dead_reckoned label.
+    window = _window()
+    samples = [_est(window.end_monotonic_ms + 500)]
+
+    # Act
+    result = evaluate_recovery_gate(window, samples, gps_health=[], consistency_checks=[])
+
+    # Assert
+    assert result.recovery_at_ms is None
+    assert result.passes is True
+
+
+# Aggregate evaluate + CSV evidence
+
+
+def _make_passing_5s_inputs() -> dict:
+    """Assemble keyword arguments for `evaluate` describing a 5 s blackout meant to pass."""
+    window = _window(duration_s=5.0)
+    onset = window.onset_monotonic_ms
+    blackout_end, stable_at = _post_window(window)
+
+    # Estimates: an early dead_reckoned sample, a 20-step rising-covariance
+    # ramp, and one satellite_anchored sample after the stability period.
+    samples = [
+        _est(onset + 50),
+        *(
+            _est(onset + 100 + step * 100, cov=10.0 + step * 0.1)
+            for step in range(20)
+        ),
+        _est(stable_at + 100, label=SATELLITE_ANCHORED_LABEL),
+    ]
+
+    # One IMU-only STATUSTEXT per second across the window.
+    messages = []
+    for tick in range(5):
+        messages.append(
+            StatustextSample(monotonic_ms=onset + tick * 1000, text=STATUSTEXT_IMU_ONLY)
+        )
+
+    rejections = [SpoofRejectedEvent(monotonic_ms=onset + 50, reason="x")]
+
+    # Twelve healthy, unspoofed GPS samples starting at the window end.
+    health = []
+    for sec in range(12):
+        health.append(
+            GpsHealthSample(monotonic_ms=blackout_end + sec * 1000, healthy=True, spoofed=False)
+        )
+
+    checks = [ConsistencyCheckEvent(monotonic_ms=blackout_end + 5000, passed=True)]
+
+    return {
+        "window": window,
+        "estimates": samples,
+        "statustexts": messages,
+        "spoof_events": rejections,
+        "gps_health": health,
+        "consistency_checks": checks,
+        "frame_period_ms": 100,
+        "is_35s_window": False,
+    }
+
+
+def test_evaluate_5s_all_pass():
+    """The aggregate `evaluate` passes on the inputs built by `_make_passing_5s_inputs`."""
+    # Arrange
+    kwargs = _make_passing_5s_inputs()
+
+    # Act
+    result = evaluate(**kwargs)
+
+    # Assert
+    assert result.passes is True
+
+
+def test_write_csv_evidence_round_trips(tmp_path: Path):
+    """The evidence CSV for a passing report holds one row with "true" pass flags."""
+    # Arrange
+    passing_report = evaluate(**_make_passing_5s_inputs())
+    target = tmp_path / "ft-n-04.csv"
+
+    # Act
+    written = write_csv_evidence(target, passing_report)
+
+    # Assert — read the file back through the path the writer returned.
+    with written.open() as fh:
+        rows = list(csv.DictReader(fh))
+    assert len(rows) == 1
+    only_row = rows[0]
+    for column in ("passes", "ac1_passes", "ac2_passes"):
+        assert only_row[column] == "true"
@@ -0,0 +1,353 @@
+"""Unit tests for `e2e/runner/helpers/outage_request_evaluator.py` (AZ-425)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+from e2e.runner.helpers.outage_request_evaluator import (
+    DEAD_RECKONED_LABEL,
+    MIN_OUTAGE_FRAMES,
+    OUTAGE_THRESHOLD_S,
+    STATUSTEXT_REGEX,
+    TOLERANCE_S,
+    EkfDivergenceEvent,
+    OutboundEstimateSample,
+    StatustextSample,
+    detect_outage_windows,
+    evaluate,
+    evaluate_window,
+    write_csv_evidence,
+)
+
+
+# Constants
+
+
+def test_constants_match_spec():
+    """Pin the AZ-425 thresholds and labels imported from the evaluator module."""
+    # Each pair is (imported constant, value the spec requires):
+    # AC-1 ≥3 frames; AC-2 2 s ±500 ms; AC-3 dead_reckoned label.
+    pinned = [
+        (MIN_OUTAGE_FRAMES, 3),
+        (OUTAGE_THRESHOLD_S, 2.0),
+        (TOLERANCE_S, 0.5),
+        (STATUSTEXT_REGEX, "OPERATOR_RELOC_REQUEST"),
+        (DEAD_RECKONED_LABEL, "dead_reckoned"),
+    ]
+    for actual, required in pinned:
+        assert actual == required
+
+
+# detect_outage_windows
+
+
+def _est(frame: int, label: str = "satellite_anchored", ms: int = 0) -> OutboundEstimateSample:
+    """Build an estimate sample for `frame`.
+
+    A falsy `ms` (including the default 0) means "derive the stamp from the
+    frame index at 100 ms per frame".
+    """
+    stamp_ms = ms or frame * 100
+    return OutboundEstimateSample(
+        frame_idx=frame,
+        monotonic_ms=stamp_ms,
+        source_label=label,
+    )
+
+
+def test_detect_no_outage_returns_empty():
+    """When every expected frame has an estimate, no outage window is reported."""
+    # Arrange — ten expected frames, each with an estimate.
+    frames = list(range(10))
+    samples = [_est(idx) for idx in frames]
+
+    # Act
+    found = detect_outage_windows(frames, samples, frame_period_ms=100)
+
+    # Assert
+    assert found == []
+
+
+def test_detect_run_below_min_length_ignored():
+    """A two-frame gap (below MIN_OUTAGE_FRAMES=3) produces no outage window."""
+    # Arrange — frames 4 and 5 have no estimate.
+    frames = list(range(10))
+    missing = (4, 5)
+    samples = [_est(idx) for idx in frames if idx not in missing]
+
+    # Act
+    found = detect_outage_windows(frames, samples, frame_period_ms=100)
+
+    # Assert
+    assert found == []
+
+
+def test_detect_single_outage_window():
+    """A three-frame gap yields one window with the expected indices and timestamps."""
+    # Arrange — frames 4, 5 and 6 have no estimate; replay starts at 1000 ms.
+    frames = list(range(10))
+    missing = (4, 5, 6)
+    samples = [_est(idx) for idx in frames if idx not in missing]
+
+    # Act
+    found = detect_outage_windows(
+        frames, samples, frame_period_ms=100, replay_start_monotonic_ms=1000
+    )
+
+    # Assert
+    assert len(found) == 1
+    outage = found[0]
+    assert outage.first_missing_frame_idx == 4
+    assert outage.last_missing_frame_idx == 6
+    assert outage.length_frames == 3
+    # Onset is the first missing frame; end is the frame after the last missing one.
+    assert outage.onset_monotonic_ms == 1000 + 4 * 100  # 1400
+    assert outage.end_monotonic_ms == 1000 + 7 * 100  # 1700
+    assert outage.duration_ms == 300
+
+
+def test_detect_multiple_windows():
+    """Two separate gaps (frames 4-6 and 12-15) yield two windows in order."""
+    # Arrange — twenty expected frames with two runs of missing estimates.
+    frames = list(range(20))
+    missing = {4, 5, 6, 12, 13, 14, 15}
+    samples = [_est(idx) for idx in frames if idx not in missing]
+
+    # Act
+    found = detect_outage_windows(frames, samples, frame_period_ms=100)
+
+    # Assert
+    assert len(found) == 2
+    first, second = found[0], found[1]
+    assert first.first_missing_frame_idx == 4 and first.length_frames == 3
+    assert second.first_missing_frame_idx == 12 and second.length_frames == 4
+
+
+def test_detect_trailing_outage_window():
+    """A gap that runs to the end of the sequence (frames 7-9) is still reported."""
+    # Arrange — only frames 0-6 have estimates.
+    frames = list(range(10))
+    samples = [_est(idx) for idx in frames if idx < 7]
+
+    # Act
+    found = detect_outage_windows(frames, samples, frame_period_ms=100)
+
+    # Assert
+    assert len(found) == 1
+    trailing = found[0]
+    assert trailing.first_missing_frame_idx == 7
+    assert trailing.last_missing_frame_idx == 9
+
+
+# evaluate_window — AC-2 STATUSTEXT timing
+
+
+def _window_at(onset_ms: int, length: int = 3, period_ms: int = 100):
+    """Return `(window, estimates)` for a single outage of `length` frames starting at frame 2.
+
+    The replay start passed to the detector is `onset_ms - 2 * period_ms`,
+    i.e. two frame periods before the requested onset. Exactly one detected
+    window is required; any other count raises on unpacking.
+    """
+    # Keep the sequence long enough to hold the gap plus trailing frames.
+    frame_count = max(20, length + 5)
+    frames = list(range(frame_count))
+    gap = {2 + k for k in range(length)}
+    estimates = [_est(idx, ms=idx * period_ms) for idx in frames if idx not in gap]
+    detected = detect_outage_windows(
+        frames,
+        estimates,
+        frame_period_ms=period_ms,
+        replay_start_monotonic_ms=onset_ms - 2 * period_ms,
+    )
+    [window] = detected
+    return window, estimates
+
+
+def test_statustext_within_tolerance_passes():
+    """An OPERATOR_RELOC_REQUEST exactly 2 s after onset is reported at 2000 ms and passes."""
+    # Arrange — a 30-frame outage with the STATUSTEXT at onset + 2000 ms.
+    outage, samples = _window_at(onset_ms=10_000, length=30, period_ms=100)
+    messages = [
+        StatustextSample(
+            monotonic_ms=outage.onset_monotonic_ms + 2000, text="OPERATOR_RELOC_REQUEST"
+        ),
+    ]
+
+    # Act
+    result = evaluate_window(outage, samples, messages, ekf_events=[])
+
+    # Assert
+    assert result.statustext_offset_ms == 2000
+    assert result.passes_statustext is True
+
+
+def test_statustext_within_tolerance_late_passes():
+    """An OPERATOR_RELOC_REQUEST 2.4 s after onset (inside ±500 ms) is expected to pass."""
+    # Arrange — a 30-frame outage with the STATUSTEXT at onset + 2400 ms.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    messages = [
+        StatustextSample(
+            monotonic_ms=outage.onset_monotonic_ms + 2400, text="OPERATOR_RELOC_REQUEST"
+        ),
+    ]
+
+    # Act
+    result = evaluate_window(outage, samples, messages, ekf_events=[])
+
+    # Assert
+    assert result.passes_statustext is True
+
+
+def test_statustext_too_early_fails():
+    """An OPERATOR_RELOC_REQUEST 1.0 s after onset (before the 1.5 s bound) is expected to fail."""
+    # Arrange — a 30-frame outage with the STATUSTEXT at onset + 1000 ms.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    messages = [
+        StatustextSample(
+            monotonic_ms=outage.onset_monotonic_ms + 1000, text="OPERATOR_RELOC_REQUEST"
+        ),
+    ]
+
+    # Act
+    result = evaluate_window(outage, samples, messages, ekf_events=[])
+
+    # Assert
+    assert result.statustext_offset_ms == 1000
+    assert result.passes_statustext is False
+
+
+def test_statustext_too_late_fails():
+    """An OPERATOR_RELOC_REQUEST 3.0 s after onset (past the 2.5 s bound) is expected to fail."""
+    # Arrange — a 30-frame outage with the STATUSTEXT at onset + 3000 ms.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    messages = [
+        StatustextSample(
+            monotonic_ms=outage.onset_monotonic_ms + 3000, text="OPERATOR_RELOC_REQUEST"
+        ),
+    ]
+
+    # Act
+    result = evaluate_window(outage, samples, messages, ekf_events=[])
+
+    # Assert
+    assert result.passes_statustext is False
+
+
+def test_statustext_missing_fails():
+    """With no STATUSTEXT at all, no offset is reported and the timing check fails."""
+    # Arrange — a 30-frame outage and an empty STATUSTEXT list.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+
+    # Act
+    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[])
+
+    # Assert
+    assert result.statustext_offset_ms is None
+    assert result.passes_statustext is False
+
+
+def test_statustext_payload_mismatch_fails():
+    """A well-timed STATUSTEXT with a different payload is expected to fail the check."""
+    # Arrange — the message arrives at onset + 2000 ms but reads "EKF_VARIANCE".
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    messages = [
+        StatustextSample(monotonic_ms=outage.onset_monotonic_ms + 2000, text="EKF_VARIANCE"),
+    ]
+
+    # Act
+    result = evaluate_window(outage, samples, messages, ekf_events=[])
+
+    # Assert
+    assert result.passes_statustext is False
+
+
+# AC-3 dead_reckoned during outage
+
+
+def test_dead_reckoned_during_window_passes():
+    """A dead_reckoned emission stamped inside the outage window satisfies AC-3."""
+    # Arrange — frames 4-6 have no satellite_anchored estimate; one
+    # dead_reckoned filler (frame_idx 4, stamped at 500 ms) is emitted instead.
+    frames = list(range(20))
+    gap = {4, 5, 6}
+    estimates = [_est(idx, ms=idx * 100) for idx in frames if idx not in gap]
+    estimates.append(
+        OutboundEstimateSample(frame_idx=4, monotonic_ms=500, source_label=DEAD_RECKONED_LABEL)
+    )
+    # The detector is fed the estimates without the filler, so the detected
+    # window still spans frames 4-6.
+    detection_input = [
+        sample
+        for sample in estimates
+        if sample.frame_idx not in {4, 5, 6} or sample.source_label == "satellite_anchored"
+    ]
+    [outage] = detect_outage_windows(frames, detection_input, frame_period_ms=100)
+
+    # Act
+    result = evaluate_window(outage, estimates, statustexts=[], ekf_events=[])
+
+    # Assert — at least one dead_reckoned emission with monotonic_ms in [onset_ms, end_ms].
+    assert result.dead_reckoned_count >= 1
+    assert result.passes_dead_reckoned is True
+
+
+def test_dead_reckoned_absent_fails():
+    """An outage with no dead_reckoned emission reports a count of zero and fails AC-3."""
+    # Arrange — a 3-frame outage whose estimates are all satellite_anchored.
+    outage, samples = _window_at(onset_ms=10_000, length=3, period_ms=100)
+
+    # Act
+    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[])
+
+    # Assert
+    assert result.dead_reckoned_count == 0
+    assert result.passes_dead_reckoned is False
+
+
+# AC-4 EKF divergence
+
+
+def test_ekf_divergence_during_window_fails():
+    """An EKF divergence event 1 s after onset is counted and fails the EKF check."""
+    # Arrange — a 30-frame outage with one divergence event inside it.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    divergence = EkfDivergenceEvent(
+        monotonic_ms=outage.onset_monotonic_ms + 1000, reason="velocity_innov"
+    )
+
+    # Act
+    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[divergence])
+
+    # Assert
+    assert result.ekf_divergence_count == 1
+    assert result.passes_ekf is False
+
+
+def test_ekf_divergence_outside_window_ignored():
+    """An EKF divergence event 1 s after the window end does not fail the EKF check."""
+    # Arrange — a 30-frame outage with one divergence event after it ends.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    divergence = EkfDivergenceEvent(
+        monotonic_ms=outage.end_monotonic_ms + 1000, reason="velocity_innov"
+    )
+
+    # Act
+    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[divergence])
+
+    # Assert
+    assert result.passes_ekf is True
+
+
+# evaluate aggregate
+
+
+def test_evaluate_all_pass():
+    """The aggregate `evaluate` returns one passing report for a well-behaved outage."""
+    # Arrange — 40 expected frames; estimates stop after frame 9, so frames
+    # 10-39 are missing. A dead_reckoned filler follows 500 ms after the first
+    # missing frame's stamp and the operator request follows 2000 ms after it.
+    period_ms = 100
+    frames = list(range(40))
+    first_missing = 10
+    gap_start_ms = first_missing * period_ms
+    samples = [_est(idx, ms=idx * period_ms) for idx in frames if idx < first_missing]
+    samples.append(
+        OutboundEstimateSample(
+            frame_idx=first_missing,
+            monotonic_ms=gap_start_ms + 500,
+            source_label=DEAD_RECKONED_LABEL,
+        )
+    )
+    messages = [
+        StatustextSample(monotonic_ms=gap_start_ms + 2000, text="OPERATOR_RELOC_REQUEST")
+    ]
+
+    # Act
+    results = evaluate(
+        frames,
+        samples,
+        messages,
+        ekf_events=[],
+        frame_period_ms=period_ms,
+    )
+
+    # Assert
+    assert len(results) == 1
+    assert results[0].passes is True
+
+
+# CSV evidence
+
+
+def test_write_csv_evidence_round_trips(tmp_path: Path):
+    """The evidence CSV for one window report holds one row with the expected fields."""
+    # Arrange — a 30-frame outage with a well-timed operator request.
+    outage, samples = _window_at(onset_ms=10_000, length=30)
+    messages = [
+        StatustextSample(
+            monotonic_ms=outage.onset_monotonic_ms + 2000, text="OPERATOR_RELOC_REQUEST"
+        )
+    ]
+    window_report = evaluate_window(outage, samples, messages, ekf_events=[])
+    target = tmp_path / "ft-n-03.csv"
+
+    # Act
+    written = write_csv_evidence(target, [window_report])
+
+    # Assert — read the file back through the path the writer returned.
+    with written.open() as fh:
+        rows = list(csv.DictReader(fh))
+    assert len(rows) == 1
+    only_row = rows[0]
+    assert only_row["passes_statustext"] == "true"
+    assert int(only_row["length_frames"]) == 30
@@ -0,0 +1,330 @@
+"""Unit tests for `e2e/runner/helpers/outlier_tolerance_evaluator.py` (AZ-424)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from e2e.runner.helpers.outlier_tolerance_evaluator import (
+    COVARIANCE_WINDOW_FRAMES,
+    DRIFT_BUDGET_M,
+    MIN_OUTLIER_COUNT,
+    GtPose,
+    OutboundEstimate,
+    OutlierEvent,
+    evaluate,
+    evaluate_event,
+    load_outlier_manifest,
+    write_csv_evidence,
+)
+
+
+# Constants
+
+
+def test_constants_match_spec():
+    """Pin the AZ-424 thresholds so an accidental change to the helper is caught."""
+    # AC-1 minimum count, then AC-2 budget, then AC-3 window.
+    assert MIN_OUTLIER_COUNT == 10
+    assert DRIFT_BUDGET_M == 50.0
+    assert COVARIANCE_WINDOW_FRAMES == 3
+
+
+# Manifest loading
+
+
+def _write_manifest(path: Path, rows: list[dict]) -> None:
+    """Write `rows` to `path` as a manifest CSV with the six manifest columns.
+
+    Columns a row does not supply are written as empty strings.
+    """
+    columns = [
+        "frame_idx",
+        "src_jpeg_path",
+        "replacement_tile_x",
+        "replacement_tile_y",
+        "geodesic_offset_m",
+        "seed",
+    ]
+    with path.open("w", newline="") as fh:
+        # restval supplies the empty string for any column missing from a row.
+        writer = csv.DictWriter(fh, fieldnames=columns, restval="")
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def test_load_outlier_manifest_missing_file_raises(tmp_path: Path):
+    """Loading a path that was never created raises FileNotFoundError."""
+    # Arrange
+    absent = tmp_path / "nope.csv"
+
+    # Assert
+    with pytest.raises(FileNotFoundError, match="outlier manifest not found"):
+        load_outlier_manifest(absent)
+
+
+def test_load_outlier_manifest_missing_columns_raises(tmp_path: Path):
+    """A manifest whose header lacks required columns is rejected with ValueError."""
+    # Arrange — the file carries only the frame_idx and src_jpeg_path columns.
+    manifest = tmp_path / "manifest.csv"
+    with manifest.open("w", newline="") as fh:
+        plain_writer = csv.writer(fh)
+        plain_writer.writerow(["frame_idx", "src_jpeg_path"])
+        plain_writer.writerow(["1", "x.jpg"])
+
+    # Assert
+    with pytest.raises(ValueError, match="missing required columns"):
+        load_outlier_manifest(manifest)
+
+
+def test_load_outlier_manifest_returns_events(tmp_path: Path):
+    """Each manifest row is loaded as one OutlierEvent, in file order."""
+    # Arrange
+    manifest = tmp_path / "manifest.csv"
+    first_row = {
+        "frame_idx": "10",
+        "src_jpeg_path": "AD000011.jpg",
+        "geodesic_offset_m": "412.5",
+    }
+    second_row = {
+        "frame_idx": "20",
+        "src_jpeg_path": "AD000021.jpg",
+        "geodesic_offset_m": "381.0",
+    }
+    _write_manifest(manifest, [first_row, second_row])
+
+    # Act
+    loaded = load_outlier_manifest(manifest)
+
+    # Assert
+    assert len(loaded) == 2
+    expected_first = OutlierEvent(
+        frame_idx=10, geodesic_offset_m=412.5, src_jpeg_path="AD000011.jpg"
+    )
+    assert loaded[0] == expected_first
+    assert loaded[1].frame_idx == 20
+
+
+# evaluate_event — AC-2 drift bound
+
+
+def _est(frame: int, lat: float, lon: float, cov: float = 5.0) -> OutboundEstimate:
+    """Build a "C3_VIO" estimate for `frame`, timestamped at `frame * 100`."""
+    timestamp_ms = frame * 100
+    estimate = OutboundEstimate(
+        frame_idx=frame,
+        monotonic_ms=timestamp_ms,
+        lat_deg=lat,
+        lon_deg=lon,
+        cov_semi_major_m=cov,
+        source_label="C3_VIO",
+    )
+    return estimate
+
+
+def _gt(frame: int, lat: float, lon: float) -> GtPose:
+    """Build the ground-truth pose for `frame` at (`lat`, `lon`)."""
+    pose = GtPose(frame_idx=frame, lat_deg=lat, lon_deg=lon)
+    return pose
+
+
+def test_evaluate_event_drift_within_budget():
+    """An event whose drift stays within the budget passes the drift gate."""
+    # Arrange — frame 9 matches GT exactly, frame 10 is the outlier frame, and
+    # frame 11 sits one 0.0001° step away from GT on both axes.
+    estimate_rows = [
+        (9, 50.0000, 30.0000, 4.0),
+        (10, 50.0050, 30.0050, 5.0),  # outlier
+        (11, 50.0001, 30.0001, 5.0),
+    ]
+    gt_rows = [
+        (9, 50.0000, 30.0000),
+        (10, 50.0001, 30.0001),
+        (11, 50.0002, 30.0002),
+    ]
+    estimates = {f: _est(f, lat, lon, cov=cov) for f, lat, lon, cov in estimate_rows}
+    gt = {f: _gt(f, lat, lon) for f, lat, lon in gt_rows}
+    event = OutlierEvent(frame_idx=10, geodesic_offset_m=412.5, src_jpeg_path="x.jpg")
+
+    # Act
+    result = evaluate_event(event, estimates, gt)
+
+    # Assert
+    assert result.frame_idx == 10
+    assert result.drift_m is not None
+    assert result.drift_m <= DRIFT_BUDGET_M
+    assert result.passes_drift is True
+
+
+def test_evaluate_event_drift_exceeds_budget_fails():
+    """An event whose drift is over the budget fails both the drift gate and overall."""
+    # Arrange — the after-frame error is far larger than the before-frame error.
+    estimate_rows = [
+        (9, 50.0000, 30.0000),
+        (10, 50.0050, 30.0050),
+        (11, 50.0010, 30.0010),  # ~129 m off
+    ]
+    gt_rows = [
+        (9, 50.0000, 30.0000),
+        (10, 50.0001, 30.0001),
+        (11, 50.0000, 30.0000),
+    ]
+    estimates = {f: _est(f, lat, lon) for f, lat, lon in estimate_rows}
+    gt = {f: _gt(f, lat, lon) for f, lat, lon in gt_rows}
+    event = OutlierEvent(frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg")
+
+    # Act
+    result = evaluate_event(event, estimates, gt)
+
+    # Assert
+    assert result.drift_m is not None
+    assert result.drift_m > DRIFT_BUDGET_M
+    assert result.passes_drift is False
+    assert result.passes is False
+
+
+def test_evaluate_event_missing_neighbour_drift_none():
+    """With no neighbouring frames, drift is reported as None and the drift gate fails."""
+    # Arrange — only the outlier frame itself has an estimate and a GT pose.
+    outlier_frame = 10
+    estimates = {outlier_frame: _est(outlier_frame, 50.0050, 30.0050)}
+    gt = {outlier_frame: _gt(outlier_frame, 50.0001, 30.0001)}
+    event = OutlierEvent(
+        frame_idx=outlier_frame, geodesic_offset_m=400.0, src_jpeg_path="x.jpg"
+    )
+
+    # Act
+    result = evaluate_event(event, estimates, gt)
+
+    # Assert
+    assert result.drift_m is None
+    assert result.passes_drift is False
+
+
+# evaluate_event — AC-3 covariance monotonic
+
+
+def test_evaluate_event_cov_monotonic_passes():
+    """Covariance rising 4.0 -> 5.0 -> 5.5 across the window satisfies AC-3."""
+    # Arrange — every estimate sits on GT; only the covariance varies.
+    cov_by_frame = {9: 4.0, 10: 5.0, 11: 5.5}
+    estimates = {f: _est(f, 50.0, 30.0, cov=c) for f, c in cov_by_frame.items()}
+    gt = {f: _gt(f, 50.0, 30.0) for f in cov_by_frame}
+    event = OutlierEvent(frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg")
+
+    # Act
+    result = evaluate_event(event, estimates, gt)
+
+    # Assert
+    assert result.cov_non_decreasing is True
+    assert result.passes_covariance is True
+
+
+def test_evaluate_event_cov_decreasing_fails():
+    """A covariance dip on the outlier frame violates AC-3."""
+    # Arrange — frame 10 drops to 4.0 while both neighbours are at 5.0.
+    cov_by_frame = {9: 5.0, 10: 4.0, 11: 5.0}
+    estimates = {f: _est(f, 50.0, 30.0, cov=c) for f, c in cov_by_frame.items()}
+    gt = {f: _gt(f, 50.0, 30.0) for f in cov_by_frame}
+    event = OutlierEvent(frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg")
+
+    # Act
+    result = evaluate_event(event, estimates, gt)
+
+    # Assert
+    assert result.cov_non_decreasing is False
+    assert result.passes_covariance is False
+
+
+def test_evaluate_event_cov_flat_window_passes():
+    """Equal covariances across the window count as non-decreasing."""
+    # Arrange — all three frames share the same covariance of 5.0.
+    window_frames = (9, 10, 11)
+    estimates = {f: _est(f, 50.0, 30.0, cov=5.0) for f in window_frames}
+    gt = {f: _gt(f, 50.0, 30.0) for f in window_frames}
+    event = OutlierEvent(frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg")
+
+    # Act
+    result = evaluate_event(event, estimates, gt)
+
+    # Assert
+    assert result.cov_non_decreasing is True
+
+
+# Aggregate evaluate — AC-1 minimum count
+
+
+def test_evaluate_count_below_minimum_fails():
+    """Five outliers fail the aggregate because AC-1 requires at least ten."""
+    # Arrange — events at frames 10, 20, ..., 50; each frame and both of its
+    # neighbours get an estimate placed exactly on GT.
+    events = [
+        OutlierEvent(
+            frame_idx=frame,
+            geodesic_offset_m=400.0,
+            src_jpeg_path=f"x{frame // 10}.jpg",
+        )
+        for frame in range(10, 60, 10)
+    ]
+    frames = [ev.frame_idx + offset for ev in events for offset in (-1, 0, 1)]
+    estimates = [_est(frame, 50.0, 30.0, cov=5.0) for frame in frames]
+    gt = [_gt(frame, 50.0, 30.0) for frame in frames]
+
+    # Act
+    summary = evaluate(events, estimates, gt)
+
+    # Assert
+    assert summary.total_outliers == 5
+    assert summary.passes_count is False
+    assert summary.passes is False
+
+
+def test_evaluate_count_at_minimum_passes_count_gate():
+    """Exactly ten clean outliers pass the count gate and the aggregate."""
+    # Arrange — events at frames 10, 20, ..., 100 with no drift or covariance
+    # violation: every estimate sits on GT with a constant covariance.
+    events = [
+        OutlierEvent(
+            frame_idx=frame,
+            geodesic_offset_m=400.0,
+            src_jpeg_path=f"x{frame // 10}.jpg",
+        )
+        for frame in range(10, 110, 10)
+    ]
+    frames = [ev.frame_idx + offset for ev in events for offset in (-1, 0, 1)]
+    estimates = [_est(frame, 50.0, 30.0, cov=5.0) for frame in frames]
+    gt = [_gt(frame, 50.0, 30.0) for frame in frames]
+
+    # Act
+    summary = evaluate(events, estimates, gt)
+
+    # Assert
+    assert summary.total_outliers == 10
+    assert summary.passes_count is True
+    assert summary.failed_event_count == 0
+    assert summary.passes is True
+
+
+def test_evaluate_mixed_pass_fail_aggregates_correctly():
+    """One drift violation among ten events is counted once and fails the aggregate."""
+    # Arrange — ten events at frames 10, 20, ..., 100, all initially on GT.
+    events = [
+        OutlierEvent(
+            frame_idx=frame,
+            geodesic_offset_m=400.0,
+            src_jpeg_path=f"x{frame // 10}.jpg",
+        )
+        for frame in range(10, 110, 10)
+    ]
+    frames = [ev.frame_idx + offset for ev in events for offset in (-1, 0, 1)]
+    gt = [_gt(frame, 50.0, 30.0) for frame in frames]
+    # Frame 31 is swapped for an estimate ≈200 m off, which pushes the drift of
+    # the event at frame_idx=30 past 50 m. The replacement goes last in the list.
+    estimates = [_est(frame, 50.0, 30.0, cov=5.0) for frame in frames if frame != 31]
+    estimates.append(_est(31, 50.0018, 30.0, cov=5.0))  # ≈200 m off
+
+    # Act
+    summary = evaluate(events, estimates, gt)
+
+    # Assert
+    assert summary.total_outliers == 10
+    assert summary.failed_event_count == 1
+    assert summary.passes is False
+
+
+# CSV evidence writer
+
+
+def test_write_csv_evidence_round_trips(tmp_path: Path):
+    """The evidence CSV reads back with one passing row per event, in event order."""
+    # Arrange — two events whose frames and neighbours all sit exactly on GT.
+    events = [
+        OutlierEvent(frame_idx=frame, geodesic_offset_m=offset_m, src_jpeg_path=jpeg)
+        for frame, offset_m, jpeg in (
+            (10, 412.5, "AD000011.jpg"),
+            (20, 381.0, "AD000021.jpg"),
+        )
+    ]
+    frames = [ev.frame_idx + offset for ev in events for offset in (-1, 0, 1)]
+    estimates = [_est(frame, 50.0, 30.0, cov=5.0) for frame in frames]
+    gt = [_gt(frame, 50.0, 30.0) for frame in frames]
+    summary = evaluate(events, estimates, gt)
+    target = tmp_path / "ft_n_01_evidence.csv"
+
+    # Act
+    out = write_csv_evidence(target, summary)
+
+    # Assert
+    assert out.exists()
+    with out.open() as fh:
+        rows = list(csv.DictReader(fh))
+    assert [int(row["frame_idx"]) for row in rows] == [10, 20]
+    for column in ("passes", "cov_non_decreasing"):
+        assert all(row[column] == "true" for row in rows)
@@ -52,6 +52,9 @@ E2E_ROOT = Path(__file__).resolve().parents[1]
        "runner/helpers/msp_frame_observer.py",
        "runner/helpers/ap_contract_evaluator.py",
        "runner/helpers/cold_start_evaluator.py",
+        "runner/helpers/outlier_tolerance_evaluator.py",
+        "runner/helpers/outage_request_evaluator.py",
+        "runner/helpers/blackout_spoof_evaluator.py",
        "fixtures/mock-suite-sat/Dockerfile",
        "fixtures/mock-suite-sat/app.py",
        "fixtures/mock-suite-sat/requirements.txt",
@@ -96,7 +99,10 @@ E2E_ROOT = Path(__file__).resolve().parents[1]
        "tests/positive/test_ft_p_09_inav.py",
        "tests/positive/test_ft_p_10_smoothing_lookback.py",
        "tests/positive/test_ft_p_11_cold_start_init.py",
+        "tests/negative/test_ft_n_01_outlier_tolerance.py",
        "tests/negative/test_ft_n_02_sharp_turn_failure.py",
+        "tests/negative/test_ft_n_03_outage_reloc.py",
+        "tests/negative/test_ft_n_04_blackout_spoof.py",
    ],
 )
 def test_required_path_exists(relative_path: str) -> None: