[AZ-424] [AZ-425] [AZ-426] Implement negatives set (FT-N-01/03/04)

Adds three pure-logic evaluators + scenarios + unit tests covering the
project's failure-mode robustness ladder (AC-3.1, AC-3.4, AC-3.5,
AC-NEW-8):

* outlier_tolerance_evaluator (AZ-424 / FT-N-01): per-event 50 m drift
  bound + 3-frame covariance-monotonic window over the AZ-408 outlier
  injector's medium-density manifest.
* outage_request_evaluator (AZ-425 / FT-N-03): detects 3+ consecutive
  missing-frame windows; validates OPERATOR_RELOC_REQUEST STATUSTEXT
  arrives at 2 s ±500 ms, dead_reckoned label during outage, and no
  FC EKF divergence.
* blackout_spoof_evaluator (AZ-426 / FT-N-04): eight-AC ladder across
  the 5 s / 15 s / 35 s sub-windows — switch latency, spoof rejection,
  monotonic covariance, honest horiz_accuracy, STATUSTEXT 1-2 Hz,
  35 s escalation thresholds, and recovery gate.

Each scenario is skip-gated on the AZ-441 / AZ-407 / AZ-416 replay /
SITL / mavproxy helpers; unit tests (14 + 18 + 29 = 61) cover the
AC logic today. Full e2e unit-test suite: 527 passed (+67).

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-17 08:26:16 +03:00
parent a644debdb7
commit 2d6d44af5d
16 changed files with 3343 additions and 1 deletions
@@ -0,0 +1,588 @@
"""Unit tests for `e2e/runner/helpers/blackout_spoof_evaluator.py` (AZ-426)."""
from __future__ import annotations
import csv
from pathlib import Path
from e2e.runner.helpers.blackout_spoof_evaluator import (
DEAD_RECKONED_LABEL,
ESCALATION_COV_2D_M,
ESCALATION_COV_FAILSAFE_M,
ESCALATION_DURATION_FAILSAFE_S,
ESCALATION_FIX_TYPE_2D,
ESCALATION_LATENCY_MS,
HONEST_ACCURACY_RATIO,
HORIZ_ACCURACY_FAILSAFE,
RECOVERY_STABLE_S,
SATELLITE_ANCHORED_LABEL,
STATUSTEXT_FAILSAFE,
STATUSTEXT_IMU_ONLY,
STATUSTEXT_RATE_MAX_HZ,
STATUSTEXT_RATE_MIN_HZ,
SWITCH_LATENCY_MS,
BlackoutWindow,
ConsistencyCheckEvent,
GpsHealthSample,
OutboundEstimateSample,
SpoofRejectedEvent,
StatustextSample,
evaluate,
evaluate_covariance_monotonic,
evaluate_escalation,
evaluate_honest_accuracy,
evaluate_recovery_gate,
evaluate_spoof_rejection,
evaluate_statustext_rate,
evaluate_switch_latency,
write_csv_evidence,
)
# Constants
def test_constants_match_spec():
    """Pin the AZ-426 constants exported by the blackout/spoof evaluator."""
    # AC-1: switch-latency ceiling, in milliseconds.
    assert SWITCH_LATENCY_MS == 400

    # AC-4: reported horiz_accuracy must be at least this fraction of covariance.
    assert HONEST_ACCURACY_RATIO == 0.95

    # AC-5: accepted STATUSTEXT rate band (1-2 Hz).
    assert STATUSTEXT_RATE_MIN_HZ == 1.0
    assert STATUSTEXT_RATE_MAX_HZ == 2.0

    # AC-6 / AC-7 / AC-8: escalation and recovery thresholds.
    assert ESCALATION_COV_2D_M == 100.0
    assert ESCALATION_COV_FAILSAFE_M == 500.0
    assert ESCALATION_DURATION_FAILSAFE_S == 30.0
    assert ESCALATION_FIX_TYPE_2D == 2
    assert HORIZ_ACCURACY_FAILSAFE == 999.0
    assert ESCALATION_LATENCY_MS == 500
    assert RECOVERY_STABLE_S == 10.0

    # STATUSTEXT payloads and source labels.
    assert STATUSTEXT_IMU_ONLY == "VISUAL_BLACKOUT_IMU_ONLY"
    assert STATUSTEXT_FAILSAFE == "VISUAL_BLACKOUT_FAILSAFE"
    assert DEAD_RECKONED_LABEL == "dead_reckoned"
    assert SATELLITE_ANCHORED_LABEL == "satellite_anchored"
def _window(onset_ms: int = 10_000, duration_s: float = 5.0) -> BlackoutWindow:
    """Build a BlackoutWindow that starts at ``onset_ms`` and lasts ``duration_s`` seconds."""
    duration_ms = int(duration_s * 1000)
    return BlackoutWindow(
        onset_monotonic_ms=onset_ms,
        end_monotonic_ms=onset_ms + duration_ms,
    )
def _est(
    ms: int,
    *,
    label: str = DEAD_RECKONED_LABEL,
    cov: float = 5.0,
    horiz: float | None = None,
    fix_type: int = 3,
) -> OutboundEstimateSample:
    """Build an OutboundEstimateSample stamped at ``ms``.

    When ``horiz`` is omitted (``None``) the sample's ``horiz_accuracy`` is
    set equal to ``cov``.
    """
    reported_accuracy = horiz
    if reported_accuracy is None:
        reported_accuracy = cov
    return OutboundEstimateSample(
        monotonic_ms=ms,
        source_label=label,
        cov_semi_major_m=cov,
        horiz_accuracy=reported_accuracy,
        fix_type=fix_type,
    )
# AC-1 switch latency
def test_switch_latency_within_400_ms_passes():
    """Check a +350 ms label switch against a 33 ms frame period.

    NOTE(review): despite the ``_passes`` suffix this test asserts a FAILING
    report. Per the original author's note the effective budget is
    min(400, 33) = 33 ms, so a 350 ms switch is inside the 400 ms ceiling but
    outside the one-frame bound. Consider renaming the test.
    """
    blackout = _window()
    onset = blackout.onset_monotonic_ms
    samples = [
        _est(onset - 100, label=SATELLITE_ANCHORED_LABEL),
        _est(onset + 350),
    ]

    result = evaluate_switch_latency(blackout, samples, frame_period_ms=33)

    assert result.first_dead_reckoned_offset_ms == 350
    assert result.passes is False
def test_switch_latency_within_one_frame_passes():
    """A dead_reckoned sample 50 ms after onset passes with a 100 ms frame period."""
    blackout = _window()
    first_dead_reckoned = _est(blackout.onset_monotonic_ms + 50)

    result = evaluate_switch_latency(
        blackout, [first_dead_reckoned], frame_period_ms=100
    )

    assert result.passes is True
def test_switch_latency_at_one_frame_boundary_passes():
    """A switch landing exactly one frame period (100 ms) after onset still passes."""
    blackout = _window()
    boundary_sample = _est(blackout.onset_monotonic_ms + 100)

    result = evaluate_switch_latency(
        blackout, [boundary_sample], frame_period_ms=100
    )

    assert result.passes is True
def test_switch_latency_missing_dead_reckoned_fails():
    """With only a satellite_anchored sample, no switch offset is found and AC-1 fails."""
    blackout = _window()
    samples = [
        _est(blackout.onset_monotonic_ms + 50, label=SATELLITE_ANCHORED_LABEL),
    ]

    result = evaluate_switch_latency(blackout, samples, frame_period_ms=100)

    assert result.first_dead_reckoned_offset_ms is None
    assert result.passes is False
# AC-2 spoof rejection
def test_spoof_rejection_pass():
    """A spoof-rejected event plus a dead_reckoned-only window satisfies AC-2."""
    blackout = _window()
    onset = blackout.onset_monotonic_ms
    samples = [_est(onset + 500)]
    rejections = [
        SpoofRejectedEvent(monotonic_ms=onset + 200, reason="delta>500m"),
    ]

    result = evaluate_spoof_rejection(blackout, samples, rejections)

    assert result.passes is True
def test_spoof_rejection_no_events_fails():
    """An empty spoof-event list makes the AC-2 check fail."""
    blackout = _window()
    samples = [_est(blackout.onset_monotonic_ms + 500)]

    result = evaluate_spoof_rejection(blackout, samples, spoof_events=[])

    assert result.passes is False
def test_spoof_rejection_label_returns_to_satellite_fails():
    """A satellite_anchored sample inside the window fails AC-2 even with a spoof event."""
    blackout = _window()
    onset = blackout.onset_monotonic_ms
    samples = [
        _est(onset + 100),
        # Label flips back to satellite_anchored one second into the window.
        _est(onset + 1000, label=SATELLITE_ANCHORED_LABEL),
    ]
    rejections = [SpoofRejectedEvent(monotonic_ms=onset + 50, reason="x")]

    result = evaluate_spoof_rejection(blackout, samples, rejections)

    assert result.satellite_anchored_inside_window == 1
    assert result.passes is False
# AC-3 covariance monotonic
def test_covariance_monotonic_pass():
    """Strictly growing covariance (5.0 → 5.5 → 6.0) passes AC-3 with no decrease recorded."""
    blackout = _window()
    onset = blackout.onset_monotonic_ms
    samples = [
        _est(onset + offset_ms, cov=cov_m)
        for offset_ms, cov_m in ((100, 5.0), (200, 5.5), (300, 6.0))
    ]

    result = evaluate_covariance_monotonic(blackout, samples)

    assert result.passes is True
    assert result.first_decreasing_at_ms is None
def test_covariance_monotonic_decreasing_fails():
    """A covariance drop (5.0 → 4.0) fails AC-3 and is reported at the second sample's timestamp."""
    blackout = _window()
    onset = blackout.onset_monotonic_ms
    drop_at_ms = onset + 200
    samples = [
        _est(onset + 100, cov=5.0),
        _est(drop_at_ms, cov=4.0),
    ]

    result = evaluate_covariance_monotonic(blackout, samples)

    assert result.first_decreasing_at_ms == drop_at_ms
    assert result.passes is False
# AC-4 honest accuracy
def test_honest_accuracy_pass():
    """horiz_accuracy equal to the covariance (10.0 vs 10.0) passes AC-4."""
    blackout = _window()
    sample = _est(blackout.onset_monotonic_ms + 100, cov=10.0, horiz=10.0)

    result = evaluate_honest_accuracy(blackout, [sample])

    assert result.passes is True
def test_honest_accuracy_boundary_pass():
    """horiz_accuracy of 9.5 against covariance 10.0 (exactly 0.95×) still passes AC-4."""
    blackout = _window()
    sample = _est(blackout.onset_monotonic_ms + 100, cov=10.0, horiz=9.5)

    result = evaluate_honest_accuracy(blackout, [sample])

    assert result.passes is True
def test_honest_accuracy_violation_fails():
    """horiz_accuracy of 9.0 against covariance 10.0 (0.90×) counts as one AC-4 violation."""
    blackout = _window()
    sample = _est(blackout.onset_monotonic_ms + 100, cov=10.0, horiz=9.0)

    result = evaluate_honest_accuracy(blackout, [sample])

    assert result.violation_count == 1
    assert result.passes is False
# AC-5 STATUSTEXT rate
def test_statustext_rate_pass_at_1hz():
    """Five IMU-only STATUSTEXTs spaced 1 s apart in a 5 s window are measured at 1 Hz and pass AC-5."""
    blackout = _window(duration_s=5.0)
    onset = blackout.onset_monotonic_ms
    messages = [
        StatustextSample(monotonic_ms=onset + k * 1000, text=STATUSTEXT_IMU_ONLY)
        for k in range(5)
    ]

    result = evaluate_statustext_rate(blackout, messages)

    assert result.observed_hz is not None
    assert abs(result.observed_hz - 1.0) < 1e-6
    assert result.passes is True
def test_statustext_rate_pass_at_2hz():
    """Ten IMU-only STATUSTEXTs spaced 500 ms apart in a 5 s window pass AC-5."""
    blackout = _window(duration_s=5.0)
    onset = blackout.onset_monotonic_ms
    messages = [
        StatustextSample(monotonic_ms=onset + k * 500, text=STATUSTEXT_IMU_ONLY)
        for k in range(10)
    ]

    result = evaluate_statustext_rate(blackout, messages)

    assert result.passes is True
def test_statustext_rate_too_slow_fails():
    """Only two STATUSTEXTs in a 5 s window (0.4 Hz) fail AC-5."""
    blackout = _window(duration_s=5.0)
    onset = blackout.onset_monotonic_ms
    messages = [
        StatustextSample(monotonic_ms=onset + k * 2000, text=STATUSTEXT_IMU_ONLY)
        for k in range(2)
    ]

    result = evaluate_statustext_rate(blackout, messages)

    assert result.passes is False
def test_statustext_rate_too_fast_fails():
    """Thirty STATUSTEXTs spread over a 5 s window (6 Hz) exceed the AC-5 ceiling."""
    blackout = _window(duration_s=5.0)
    onset = blackout.onset_monotonic_ms
    messages = [
        StatustextSample(
            monotonic_ms=onset + int(k * 5000 / 30),
            text=STATUSTEXT_IMU_ONLY,
        )
        for k in range(30)
    ]

    result = evaluate_statustext_rate(blackout, messages)

    assert result.observed_hz is not None
    assert result.observed_hz > STATUSTEXT_RATE_MAX_HZ
    assert result.passes is False
# AC-6 / AC-7 escalation (35 s window)
def _make_35s_window(onset_ms: int = 10_000) -> BlackoutWindow:
    """Return a 35-second blackout window beginning at ``onset_ms``."""
    window_35s = _window(onset_ms=onset_ms, duration_s=35.0)
    return window_35s
def test_escalation_non_35s_window_passes_vacuously():
    """With ``is_35s_window=False`` the escalation check passes without any STATUSTEXT."""
    blackout = _window(duration_s=5.0)
    samples = [_est(blackout.onset_monotonic_ms + 100, cov=10.0)]

    result = evaluate_escalation(
        blackout, samples, statustexts=[], is_35s_window=False
    )

    assert result.passes is True
def test_escalation_35s_no_crossings_passes():
    """35 s window with low covariance and no failsafe STATUSTEXT.

    NOTE(review): the ``_passes`` suffix is misleading. Covariance stays far
    below ESCALATION_COV_2D_M, but the test asserts that the
    ``cov500_or_30s_crossed`` trigger is still reported (per the original
    author's note, the 30 s duration alone triggers the AC-7 path) and that
    AC-7 FAILS because no failsafe STATUSTEXT is supplied.
    """
    blackout = _make_35s_window()
    onset = blackout.onset_monotonic_ms
    # One sample per second for 30 s, covariance creeping from 10.0 to 12.9 m.
    samples = [
        _est(onset + second * 1000, cov=10.0 + second * 0.1)
        for second in range(30)
    ]

    result = evaluate_escalation(
        blackout, samples, statustexts=[], is_35s_window=True
    )

    assert result.cov500_or_30s_crossed is True
    assert result.passes_ac7 is False
def test_escalation_35s_ac6_fix_type_degraded_passes():
    """Covariance jumps to 150 m at +5 s with fix_type degraded to 2; AC-6 and AC-7 pass.

    A failsafe STATUSTEXT is supplied 100 ms after the duration-based failsafe
    point (onset + ESCALATION_DURATION_FAILSAFE_S).
    """
    blackout = _make_35s_window()
    onset = blackout.onset_monotonic_ms

    # One sample per second: healthy for the first 5 s, degraded afterwards.
    samples = []
    for second in range(30):
        degraded = second >= 5
        cov_m = 150.0 if degraded else 50.0
        samples.append(
            _est(
                onset + second * 1000,
                cov=cov_m,
                horiz=cov_m,
                fix_type=2 if degraded else 3,
            )
        )

    failsafe_at = onset + int(ESCALATION_DURATION_FAILSAFE_S * 1000)
    messages = [
        StatustextSample(monotonic_ms=failsafe_at + 100, text=STATUSTEXT_FAILSAFE),
    ]

    # Samples at or after the failsafe point must report the failsafe accuracy.
    # NOTE(review): the 30 samples end at onset + 29 000 ms, so when the
    # failsafe duration is 30 s none of them reaches ``failsafe_at`` and this
    # rewrite changes nothing — confirm whether more samples were intended.
    for idx, sample in enumerate(samples):
        if sample.monotonic_ms >= failsafe_at:
            samples[idx] = OutboundEstimateSample(
                monotonic_ms=sample.monotonic_ms,
                source_label=DEAD_RECKONED_LABEL,
                cov_semi_major_m=sample.cov_semi_major_m,
                horiz_accuracy=HORIZ_ACCURACY_FAILSAFE,
                fix_type=2,
            )

    result = evaluate_escalation(blackout, samples, messages, is_35s_window=True)

    assert result.passes_ac6 is True
    assert result.passes_ac7 is True
def test_escalation_35s_ac6_fix_type_not_degraded_fails():
    """Covariance jumps to 150 m at +5 s but fix_type stays 3, so AC-6 fails."""
    blackout = _make_35s_window()
    onset = blackout.onset_monotonic_ms
    samples = [
        _est(
            onset + second * 1000,
            cov=50.0 if second < 5 else 150.0,
            fix_type=3,
        )
        for second in range(30)
    ]

    result = evaluate_escalation(
        blackout, samples, statustexts=[], is_35s_window=True
    )

    assert result.passes_ac6 is False
def test_escalation_35s_ac7_horiz_not_999_fails():
    """Failsafe is triggered but horiz_accuracy keeps tracking covariance, so AC-7 fails."""
    blackout = _make_35s_window()
    onset = blackout.onset_monotonic_ms

    # Covariance ramps 20 m per second from 50 m, crossing 100 m and then 500 m.
    samples = []
    for second in range(35):
        cov_m = 50.0 + second * 20.0
        samples.append(
            _est(onset + second * 1000, cov=cov_m, horiz=cov_m, fix_type=2)
        )

    failsafe_at = onset + int(ESCALATION_DURATION_FAILSAFE_S * 1000)
    messages = [
        StatustextSample(monotonic_ms=failsafe_at + 100, text=STATUSTEXT_FAILSAFE),
    ]

    result = evaluate_escalation(blackout, samples, messages, is_35s_window=True)

    assert result.horiz_accuracy_999 is False
    assert result.passes_ac7 is False
# AC-8 recovery gate
def _post_window(w: BlackoutWindow) -> tuple[int, int]:
    """Return ``(window_end_ms, window_end_ms + RECOVERY_STABLE_S in ms + 500)``."""
    window_end = w.end_monotonic_ms
    stable_ms = int(RECOVERY_STABLE_S * 1000)
    return window_end, window_end + stable_ms + 500
def test_recovery_gate_pass():
    """Twelve healthy GPS samples, a passed consistency check, then satellite_anchored: AC-8 passes."""
    blackout = _window()
    window_end, after_stable = _post_window(blackout)
    recovery_ms = after_stable + 100
    samples = [
        _est(window_end + 500),
        _est(recovery_ms, label=SATELLITE_ANCHORED_LABEL),
    ]
    health = [
        GpsHealthSample(
            monotonic_ms=window_end + second * 1000, healthy=True, spoofed=False
        )
        for second in range(12)
    ]
    checks = [ConsistencyCheckEvent(monotonic_ms=window_end + 5000, passed=True)]

    result = evaluate_recovery_gate(blackout, samples, health, checks)

    assert result.passes is True
    assert result.recovery_at_ms == recovery_ms
def test_recovery_gate_unstable_fails():
    """One unhealthy GPS sample (the sixth, at +5 s) before recovery fails AC-8."""
    blackout = _window()
    window_end, after_stable = _post_window(blackout)
    samples = [_est(after_stable + 100, label=SATELLITE_ANCHORED_LABEL)]
    health = [
        GpsHealthSample(
            monotonic_ms=window_end + second * 1000,
            healthy=(second != 5),
            spoofed=False,
        )
        for second in range(12)
    ]
    checks = [ConsistencyCheckEvent(monotonic_ms=window_end + 5000, passed=True)]

    result = evaluate_recovery_gate(blackout, samples, health, checks)

    assert result.passes is False
def test_recovery_gate_spoofed_fails():
    """A single spoofed GPS sample (at +3 s) among healthy ones fails AC-8."""
    blackout = _window()
    window_end, after_stable = _post_window(blackout)
    samples = [_est(after_stable + 100, label=SATELLITE_ANCHORED_LABEL)]
    health = [
        GpsHealthSample(
            monotonic_ms=window_end + second * 1000,
            healthy=True,
            spoofed=(second == 3),
        )
        for second in range(12)
    ]
    checks = [ConsistencyCheckEvent(monotonic_ms=window_end + 5000, passed=True)]

    result = evaluate_recovery_gate(blackout, samples, health, checks)

    assert result.passes is False
def test_recovery_gate_no_consistency_check_fails():
    """Healthy GPS but no consistency-check events: the report flags it and AC-8 fails."""
    blackout = _window()
    window_end, after_stable = _post_window(blackout)
    samples = [_est(after_stable + 100, label=SATELLITE_ANCHORED_LABEL)]
    health = [
        GpsHealthSample(
            monotonic_ms=window_end + second * 1000, healthy=True, spoofed=False
        )
        for second in range(12)
    ]

    result = evaluate_recovery_gate(
        blackout, samples, health, consistency_checks=[]
    )

    assert result.consistency_check_passed is False
    assert result.passes is False
def test_recovery_gate_no_recovery_attempt_vacuous_pass():
    """No satellite_anchored sample after the window: no recovery time is reported and AC-8 passes."""
    blackout = _window()
    samples = [_est(blackout.end_monotonic_ms + 500)]

    result = evaluate_recovery_gate(
        blackout, samples, gps_health=[], consistency_checks=[]
    )

    assert result.recovery_at_ms is None
    assert result.passes is True
# Aggregate evaluate + CSV evidence
def _make_passing_5s_inputs() -> dict:
    """Assemble keyword arguments for ``evaluate`` describing a 5 s blackout.

    The returned dict holds the window, estimate samples, IMU-only STATUSTEXTs,
    one spoof-rejected event, post-window GPS health samples, one passed
    consistency check, a 100 ms frame period and ``is_35s_window=False``.
    """
    blackout = _window(duration_s=5.0)
    onset = blackout.onset_monotonic_ms
    window_end, after_stable = _post_window(blackout)

    samples = [
        # First dead_reckoned sample 50 ms after onset.
        _est(onset + 50),
        # Twenty samples 100 ms apart with slowly growing covariance.
        *(
            _est(onset + 100 + step * 100, cov=10.0 + step * 0.1)
            for step in range(20)
        ),
        # Return to satellite_anchored after the post-window stability period.
        _est(after_stable + 100, label=SATELLITE_ANCHORED_LABEL),
    ]
    messages = [
        StatustextSample(monotonic_ms=onset + second * 1000, text=STATUSTEXT_IMU_ONLY)
        for second in range(5)
    ]
    rejections = [SpoofRejectedEvent(monotonic_ms=onset + 50, reason="x")]
    health = [
        GpsHealthSample(
            monotonic_ms=window_end + second * 1000, healthy=True, spoofed=False
        )
        for second in range(12)
    ]
    checks = [ConsistencyCheckEvent(monotonic_ms=window_end + 5000, passed=True)]

    return {
        "window": blackout,
        "estimates": samples,
        "statustexts": messages,
        "spoof_events": rejections,
        "gps_health": health,
        "consistency_checks": checks,
        "frame_period_ms": 100,
        "is_35s_window": False,
    }
def test_evaluate_5s_all_pass():
    """The aggregate ``evaluate`` passes on the canned 5 s inputs."""
    kwargs = _make_passing_5s_inputs()

    result = evaluate(**kwargs)

    assert result.passes is True
def test_write_csv_evidence_round_trips(tmp_path: Path):
    """The evidence CSV for a passing report holds one row with lowercase ``true`` flags."""
    result = evaluate(**_make_passing_5s_inputs())
    target = tmp_path / "ft-n-04.csv"

    written = write_csv_evidence(target, result)

    with written.open() as handle:
        records = list(csv.DictReader(handle))
    assert len(records) == 1
    only_row = records[0]
    assert only_row["passes"] == "true"
    assert only_row["ac1_passes"] == "true"
    assert only_row["ac2_passes"] == "true"
@@ -0,0 +1,353 @@
"""Unit tests for `e2e/runner/helpers/outage_request_evaluator.py` (AZ-425)."""
from __future__ import annotations
import csv
from pathlib import Path
from e2e.runner.helpers.outage_request_evaluator import (
DEAD_RECKONED_LABEL,
MIN_OUTAGE_FRAMES,
OUTAGE_THRESHOLD_S,
STATUSTEXT_REGEX,
TOLERANCE_S,
EkfDivergenceEvent,
OutboundEstimateSample,
StatustextSample,
detect_outage_windows,
evaluate,
evaluate_window,
write_csv_evidence,
)
# Constants
def test_constants_match_spec():
    """Pin the AZ-425 constants exported by the outage-request evaluator."""
    # AC-1: minimum run of missing frames that counts as an outage.
    assert MIN_OUTAGE_FRAMES == 3

    # AC-2: STATUSTEXT expected at 2 s ±500 ms with this payload.
    assert OUTAGE_THRESHOLD_S == 2.0
    assert TOLERANCE_S == 0.5
    assert STATUSTEXT_REGEX == "OPERATOR_RELOC_REQUEST"

    # AC-3: label expected on estimates emitted during the outage.
    assert DEAD_RECKONED_LABEL == "dead_reckoned"
# detect_outage_windows
def _est(
    frame: int,
    label: str = "satellite_anchored",
    ms: int | None = None,
) -> OutboundEstimateSample:
    """Build an OutboundEstimateSample for ``frame``.

    Args:
        frame: Frame index stored on the sample.
        label: Source label stored on the sample.
        ms: Monotonic timestamp in milliseconds. When omitted (``None``) it
            defaults to ``frame * 100``.

    The previous implementation used ``0`` as the "not provided" marker and
    tested it by truthiness, so an explicit ``ms=0`` on a non-zero frame was
    silently replaced by ``frame * 100``. ``None`` keeps 0 usable as a real
    timestamp; callers that omit ``ms`` see the same result as before.
    """
    timestamp_ms = frame * 100 if ms is None else ms
    return OutboundEstimateSample(
        frame_idx=frame,
        monotonic_ms=timestamp_ms,
        source_label=label,
    )
def test_detect_no_outage_returns_empty():
    """When every expected frame has an estimate, no outage window is detected."""
    frame_ids = list(range(10))
    samples = [_est(idx) for idx in frame_ids]

    found = detect_outage_windows(frame_ids, samples, frame_period_ms=100)

    assert found == []
def test_detect_run_below_min_length_ignored():
    """A two-frame gap (frames 4 and 5) is shorter than MIN_OUTAGE_FRAMES and is ignored."""
    frame_ids = list(range(10))
    missing = (4, 5)
    samples = [_est(idx) for idx in frame_ids if idx not in missing]

    found = detect_outage_windows(frame_ids, samples, frame_period_ms=100)

    assert found == []
def test_detect_single_outage_window():
    """A three-frame gap (frames 4-6) yields one window with the expected bounds and timing."""
    frame_ids = list(range(10))
    missing = (4, 5, 6)
    samples = [_est(idx) for idx in frame_ids if idx not in missing]

    found = detect_outage_windows(
        frame_ids,
        samples,
        frame_period_ms=100,
        replay_start_monotonic_ms=1000,
    )

    assert len(found) == 1
    outage = found[0]
    assert outage.first_missing_frame_idx == 4
    assert outage.last_missing_frame_idx == 6
    assert outage.length_frames == 3
    # Onset is at the first missing frame, end at the first frame after the gap.
    assert outage.onset_monotonic_ms == 1000 + 4 * 100  # 1400
    assert outage.end_monotonic_ms == 1000 + 7 * 100  # 1700
    assert outage.duration_ms == 300
def test_detect_multiple_windows():
    """Two separate gaps (frames 4-6 and 12-15) produce two windows in order."""
    frame_ids = list(range(20))
    missing = {4, 5, 6, 12, 13, 14, 15}
    samples = [_est(idx) for idx in frame_ids if idx not in missing]

    found = detect_outage_windows(frame_ids, samples, frame_period_ms=100)

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.first_missing_frame_idx == 4
    assert first.length_frames == 3
    assert second.first_missing_frame_idx == 12
    assert second.length_frames == 4
def test_detect_trailing_outage_window():
    """A gap that runs to the end of the sequence (frames 7-9) is still reported."""
    frame_ids = list(range(10))
    samples = [_est(idx) for idx in frame_ids if idx < 7]

    found = detect_outage_windows(frame_ids, samples, frame_period_ms=100)

    assert len(found) == 1
    trailing = found[0]
    assert trailing.first_missing_frame_idx == 7
    assert trailing.last_missing_frame_idx == 9
# evaluate_window — AC-2 STATUSTEXT timing
def _window_at(onset_ms: int, length: int = 3, period_ms: int = 100):
    """Detect a single outage of ``length`` frames and return ``(window, estimates)``.

    The gap starts at frame index 2. The replay start is shifted back by two
    frame periods, presumably so the detected onset lands on ``onset_ms``.
    Raises ValueError from the unpacking if detection does not return exactly
    one window.
    """
    # Keep the sequence long enough to hold the gap plus trailing frames.
    frame_count = max(20, length + 5)
    frame_ids = list(range(frame_count))
    gap = set(range(2, 2 + length))
    samples = [
        _est(idx, ms=idx * period_ms) for idx in frame_ids if idx not in gap
    ]
    (outage,) = detect_outage_windows(
        frame_ids,
        samples,
        frame_period_ms=period_ms,
        replay_start_monotonic_ms=onset_ms - 2 * period_ms,
    )
    return outage, samples
def test_statustext_within_tolerance_passes():
    """A relocation-request STATUSTEXT exactly 2 s after onset passes AC-2."""
    outage, samples = _window_at(onset_ms=10_000, length=30, period_ms=100)
    messages = [
        StatustextSample(
            monotonic_ms=outage.onset_monotonic_ms + 2000,
            text="OPERATOR_RELOC_REQUEST",
        ),
    ]

    result = evaluate_window(outage, samples, messages, ekf_events=[])

    assert result.statustext_offset_ms == 2000
    assert result.passes_statustext is True
def test_statustext_within_tolerance_late_passes():
    """A STATUSTEXT at onset + 2.4 s is inside the ±500 ms tolerance and passes."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    messages = [
        StatustextSample(
            monotonic_ms=outage.onset_monotonic_ms + 2400,
            text="OPERATOR_RELOC_REQUEST",
        ),
    ]

    result = evaluate_window(outage, samples, messages, ekf_events=[])

    assert result.passes_statustext is True
def test_statustext_too_early_fails():
    """A STATUSTEXT at onset + 1.0 s precedes the 1.5 s lower bound and fails."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    messages = [
        StatustextSample(
            monotonic_ms=outage.onset_monotonic_ms + 1000,
            text="OPERATOR_RELOC_REQUEST",
        ),
    ]

    result = evaluate_window(outage, samples, messages, ekf_events=[])

    assert result.statustext_offset_ms == 1000
    assert result.passes_statustext is False
def test_statustext_too_late_fails():
    """A STATUSTEXT at onset + 3.0 s is past the 2.5 s upper bound and fails."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    messages = [
        StatustextSample(
            monotonic_ms=outage.onset_monotonic_ms + 3000,
            text="OPERATOR_RELOC_REQUEST",
        ),
    ]

    result = evaluate_window(outage, samples, messages, ekf_events=[])

    assert result.passes_statustext is False
def test_statustext_missing_fails():
    """With no STATUSTEXT at all, no offset is reported and the check fails."""
    outage, samples = _window_at(onset_ms=10_000, length=30)

    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[])

    assert result.statustext_offset_ms is None
    assert result.passes_statustext is False
def test_statustext_payload_mismatch_fails():
    """A correctly timed STATUSTEXT with a different payload does not satisfy AC-2."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    unrelated = StatustextSample(
        monotonic_ms=outage.onset_monotonic_ms + 2000,
        text="EKF_VARIANCE",
    )

    result = evaluate_window(outage, samples, [unrelated], ekf_events=[])

    assert result.passes_statustext is False
# AC-3 dead_reckoned during outage
def test_dead_reckoned_during_window_passes():
    """A dead_reckoned sample emitted inside the outage (frames 4-6) satisfies AC-3.

    The filler sample carries ``frame_idx=4`` and ``monotonic_ms=500``.
    """
    frame_ids = list(range(20))
    outage_frames = {4, 5, 6}
    samples = [
        _est(idx, ms=idx * 100) for idx in frame_ids if idx not in outage_frames
    ]
    # Dead-reckoned filler emitted while the outage is in progress.
    samples.append(
        OutboundEstimateSample(
            frame_idx=4, monotonic_ms=500, source_label=DEAD_RECKONED_LABEL
        )
    )

    # The test drops the filler before detection so the gap still spans 4-6.
    detection_input = [
        sample
        for sample in samples
        if sample.source_label == "satellite_anchored"
        or sample.frame_idx not in outage_frames
    ]
    [outage] = detect_outage_windows(
        frame_ids, detection_input, frame_period_ms=100
    )

    # Evaluation receives the full list, filler included.
    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[])

    assert result.dead_reckoned_count >= 1
    assert result.passes_dead_reckoned is True
def test_dead_reckoned_absent_fails():
    """With no dead_reckoned samples at all, the count is zero and AC-3 fails."""
    outage, samples = _window_at(onset_ms=10_000, length=3, period_ms=100)

    result = evaluate_window(outage, samples, statustexts=[], ekf_events=[])

    assert result.dead_reckoned_count == 0
    assert result.passes_dead_reckoned is False
# AC-4 EKF divergence
def test_ekf_divergence_during_window_fails():
    """An EKF divergence event 1 s after onset is counted and fails AC-4."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    divergence = EkfDivergenceEvent(
        monotonic_ms=outage.onset_monotonic_ms + 1000,
        reason="velocity_innov",
    )

    result = evaluate_window(
        outage, samples, statustexts=[], ekf_events=[divergence]
    )

    assert result.ekf_divergence_count == 1
    assert result.passes_ekf is False
def test_ekf_divergence_outside_window_ignored():
    """An EKF divergence event 1 s after the window ends does not fail AC-4."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    late_divergence = EkfDivergenceEvent(
        monotonic_ms=outage.end_monotonic_ms + 1000,
        reason="velocity_innov",
    )

    result = evaluate_window(
        outage, samples, statustexts=[], ekf_events=[late_divergence]
    )

    assert result.passes_ekf is True
# evaluate aggregate
def test_evaluate_all_pass():
    """The aggregate ``evaluate`` returns one passing report for one well-behaved outage.

    Frames 10-39 are missing; a dead_reckoned sample arrives 500 ms into the
    gap and the relocation-request STATUSTEXT 2 s into it.
    """
    period_ms = 100
    frame_ids = list(range(40))
    missing = set(range(10, 40))
    outage_start_ms = 10 * period_ms

    samples = [
        _est(idx, ms=idx * period_ms) for idx in frame_ids if idx not in missing
    ]
    samples.append(
        OutboundEstimateSample(
            frame_idx=10,
            monotonic_ms=outage_start_ms + 500,
            source_label=DEAD_RECKONED_LABEL,
        )
    )
    messages = [
        StatustextSample(
            monotonic_ms=outage_start_ms + 2000, text="OPERATOR_RELOC_REQUEST"
        ),
    ]

    results = evaluate(
        frame_ids,
        samples,
        messages,
        ekf_events=[],
        frame_period_ms=period_ms,
    )

    assert len(results) == 1
    assert results[0].passes is True
# CSV evidence
def test_write_csv_evidence_round_trips(tmp_path: Path):
    """The evidence CSV for one window report holds one row with the expected fields."""
    outage, samples = _window_at(onset_ms=10_000, length=30)
    on_time = StatustextSample(
        monotonic_ms=outage.onset_monotonic_ms + 2000,
        text="OPERATOR_RELOC_REQUEST",
    )
    window_report = evaluate_window(outage, samples, [on_time], ekf_events=[])

    written = write_csv_evidence(tmp_path / "ft-n-03.csv", [window_report])

    with written.open() as handle:
        records = list(csv.DictReader(handle))
    assert len(records) == 1
    only_row = records[0]
    assert only_row["passes_statustext"] == "true"
    assert int(only_row["length_frames"]) == 30
@@ -0,0 +1,330 @@
"""Unit tests for `e2e/runner/helpers/outlier_tolerance_evaluator.py` (AZ-424)."""
from __future__ import annotations
import csv
from pathlib import Path
import pytest
from e2e.runner.helpers.outlier_tolerance_evaluator import (
COVARIANCE_WINDOW_FRAMES,
DRIFT_BUDGET_M,
MIN_OUTLIER_COUNT,
GtPose,
OutboundEstimate,
OutlierEvent,
evaluate,
evaluate_event,
load_outlier_manifest,
write_csv_evidence,
)
# Constants
def test_constants_match_spec():
    """Pin the AZ-424 constants exported by the outlier-tolerance evaluator."""
    # AC-1: minimum number of outlier events in the manifest.
    assert MIN_OUTLIER_COUNT == 10
    # AC-2: per-event drift budget, in metres.
    assert DRIFT_BUDGET_M == 50.0
    # AC-3: covariance-monotonic window length, in frames.
    assert COVARIANCE_WINDOW_FRAMES == 3
# Manifest loading
def _write_manifest(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as an outlier-manifest CSV.

    The file always has the six manifest columns; any column a row does not
    supply is written as an empty string.
    """
    columns = [
        "frame_idx",
        "src_jpeg_path",
        "replacement_tile_x",
        "replacement_tile_y",
        "geodesic_offset_m",
        "seed",
    ]
    with path.open("w", newline="") as handle:
        manifest = csv.DictWriter(handle, fieldnames=columns)
        manifest.writeheader()
        for partial in rows:
            # Start from an all-blank record, then overlay the supplied values.
            record = dict.fromkeys(columns, "")
            record.update(partial)
            manifest.writerow(record)
def test_load_outlier_manifest_missing_file_raises(tmp_path: Path):
    """Loading a non-existent manifest raises FileNotFoundError with the expected message."""
    absent = tmp_path / "nope.csv"

    with pytest.raises(FileNotFoundError, match="outlier manifest not found"):
        load_outlier_manifest(absent)
def test_load_outlier_manifest_missing_columns_raises(tmp_path: Path):
    """A manifest with only two of the columns is rejected with ValueError."""
    manifest_path = tmp_path / "manifest.csv"
    with manifest_path.open("w", newline="") as handle:
        partial = csv.DictWriter(handle, fieldnames=["frame_idx", "src_jpeg_path"])
        partial.writeheader()
        partial.writerow({"frame_idx": "1", "src_jpeg_path": "x.jpg"})

    with pytest.raises(ValueError, match="missing required columns"):
        load_outlier_manifest(manifest_path)
def test_load_outlier_manifest_returns_events(tmp_path: Path):
    """A two-row manifest loads as two OutlierEvent records in file order."""
    manifest_path = tmp_path / "manifest.csv"
    manifest_rows = [
        {"frame_idx": "10", "src_jpeg_path": "AD000011.jpg", "geodesic_offset_m": "412.5"},
        {"frame_idx": "20", "src_jpeg_path": "AD000021.jpg", "geodesic_offset_m": "381.0"},
    ]
    _write_manifest(manifest_path, manifest_rows)

    loaded = load_outlier_manifest(manifest_path)

    assert len(loaded) == 2
    first, second = loaded[0], loaded[1]
    assert first == OutlierEvent(
        frame_idx=10, geodesic_offset_m=412.5, src_jpeg_path="AD000011.jpg"
    )
    assert second.frame_idx == 20
# evaluate_event — AC-2 drift bound
def _est(frame: int, lat: float, lon: float, cov: float = 5.0) -> OutboundEstimate:
    """Build an OutboundEstimate for ``frame``, timestamped at ``frame * 100`` ms.

    The source label is fixed to ``"C3_VIO"``.
    """
    timestamp_ms = frame * 100
    return OutboundEstimate(
        frame_idx=frame,
        monotonic_ms=timestamp_ms,
        lat_deg=lat,
        lon_deg=lon,
        cov_semi_major_m=cov,
        source_label="C3_VIO",
    )
def _gt(frame: int, lat: float, lon: float) -> GtPose:
    """Build a ground-truth GtPose for ``frame`` at the given latitude/longitude."""
    pose = GtPose(frame_idx=frame, lat_deg=lat, lon_deg=lon)
    return pose
def test_evaluate_event_drift_within_budget():
    """An outlier at frame 10 with well-behaved neighbours stays within the drift budget.

    The before-frame estimate equals ground truth; the after-frame estimate is
    0.0001° off in each axis; only the outlier frame itself is far off.
    """
    estimate_by_frame = {
        9: _est(9, 50.0000, 30.0000, cov=4.0),
        10: _est(10, 50.0050, 30.0050, cov=5.0),  # outlier frame
        11: _est(11, 50.0001, 30.0001, cov=5.0),
    }
    truth_by_frame = {
        9: _gt(9, 50.0000, 30.0000),
        10: _gt(10, 50.0001, 30.0001),
        11: _gt(11, 50.0002, 30.0002),
    }
    outlier = OutlierEvent(
        frame_idx=10, geodesic_offset_m=412.5, src_jpeg_path="x.jpg"
    )

    result = evaluate_event(outlier, estimate_by_frame, truth_by_frame)

    assert result.frame_idx == 10
    assert result.drift_m is not None
    assert result.drift_m <= DRIFT_BUDGET_M
    assert result.passes_drift is True
def test_evaluate_event_drift_exceeds_budget_fails():
    """An after-frame estimate 0.001° off in both axes blows the drift budget.

    The original author estimated this offset at roughly 129 m.
    """
    estimate_by_frame = {
        9: _est(9, 50.0000, 30.0000),
        10: _est(10, 50.0050, 30.0050),
        11: _est(11, 50.0010, 30.0010),  # far from ground truth
    }
    truth_by_frame = {
        9: _gt(9, 50.0000, 30.0000),
        10: _gt(10, 50.0001, 30.0001),
        11: _gt(11, 50.0000, 30.0000),
    }
    outlier = OutlierEvent(
        frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg"
    )

    result = evaluate_event(outlier, estimate_by_frame, truth_by_frame)

    assert result.drift_m is not None
    assert result.drift_m > DRIFT_BUDGET_M
    assert result.passes_drift is False
    assert result.passes is False
def test_evaluate_event_missing_neighbour_drift_none():
    """With only the outlier frame present, drift is reported as None and AC-2 fails."""
    estimate_by_frame = {10: _est(10, 50.0050, 30.0050)}
    truth_by_frame = {10: _gt(10, 50.0001, 30.0001)}
    outlier = OutlierEvent(
        frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg"
    )

    result = evaluate_event(outlier, estimate_by_frame, truth_by_frame)

    assert result.drift_m is None
    assert result.passes_drift is False
# evaluate_event — AC-3 covariance monotonic
def test_evaluate_event_cov_monotonic_passes():
    """Covariance rising 4.0 → 5.0 → 5.5 across frames 9-11 passes AC-3."""
    cov_by_frame = {9: 4.0, 10: 5.0, 11: 5.5}
    estimate_by_frame = {
        frame: _est(frame, 50.0, 30.0, cov=cov_m)
        for frame, cov_m in cov_by_frame.items()
    }
    truth_by_frame = {frame: _gt(frame, 50.0, 30.0) for frame in (9, 10, 11)}
    outlier = OutlierEvent(
        frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg"
    )

    result = evaluate_event(outlier, estimate_by_frame, truth_by_frame)

    assert result.cov_non_decreasing is True
    assert result.passes_covariance is True
def test_evaluate_event_cov_decreasing_fails():
    """Covariance dipping at the outlier frame (5.0 → 4.0 → 5.0) violates AC-3."""
    cov_by_frame = {9: 5.0, 10: 4.0, 11: 5.0}  # frame 10 is the decrease
    estimate_by_frame = {
        frame: _est(frame, 50.0, 30.0, cov=cov_m)
        for frame, cov_m in cov_by_frame.items()
    }
    truth_by_frame = {frame: _gt(frame, 50.0, 30.0) for frame in (9, 10, 11)}
    outlier = OutlierEvent(
        frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg"
    )

    result = evaluate_event(outlier, estimate_by_frame, truth_by_frame)

    assert result.cov_non_decreasing is False
    assert result.passes_covariance is False
def test_evaluate_event_cov_flat_window_passes():
    """Equal covariance (5.0) on all three frames counts as non-decreasing."""
    window_frames = (9, 10, 11)
    estimate_by_frame = {
        frame: _est(frame, 50.0, 30.0, cov=5.0) for frame in window_frames
    }
    truth_by_frame = {frame: _gt(frame, 50.0, 30.0) for frame in window_frames}
    outlier = OutlierEvent(
        frame_idx=10, geodesic_offset_m=400.0, src_jpeg_path="x.jpg"
    )

    result = evaluate_event(outlier, estimate_by_frame, truth_by_frame)

    assert result.cov_non_decreasing is True
# Aggregate evaluate — AC-1 minimum count
def test_evaluate_count_below_minimum_fails():
    """Five outlier events are below the AC-1 minimum of ten, so the aggregate fails."""
    outliers = [
        OutlierEvent(
            frame_idx=n * 10, geodesic_offset_m=400.0, src_jpeg_path=f"x{n}.jpg"
        )
        for n in range(1, 6)
    ]
    # Each event gets a before / at / after frame, all sitting on ground truth.
    frames = [ev.frame_idx + delta for ev in outliers for delta in (-1, 0, 1)]
    estimates: list[OutboundEstimate] = [
        _est(frame, 50.0, 30.0, cov=5.0) for frame in frames
    ]
    gt: list[GtPose] = [_gt(frame, 50.0, 30.0) for frame in frames]

    result = evaluate(outliers, estimates, gt)

    assert result.total_outliers == 5
    assert result.passes_count is False
    assert result.passes is False
def test_evaluate_count_at_minimum_passes_count_gate():
    """Exactly ten clean outlier events pass the count gate and the aggregate."""
    outliers = [
        OutlierEvent(
            frame_idx=n * 10, geodesic_offset_m=400.0, src_jpeg_path=f"x{n}.jpg"
        )
        for n in range(1, 11)
    ]
    # Each event gets a before / at / after frame, all sitting on ground truth.
    frames = [ev.frame_idx + delta for ev in outliers for delta in (-1, 0, 1)]
    estimates: list[OutboundEstimate] = [
        _est(frame, 50.0, 30.0, cov=5.0) for frame in frames
    ]
    gt: list[GtPose] = [_gt(frame, 50.0, 30.0) for frame in frames]

    result = evaluate(outliers, estimates, gt)

    assert result.total_outliers == 10
    assert result.passes_count is True
    assert result.failed_event_count == 0
    assert result.passes is True
def test_evaluate_mixed_pass_fail_aggregates_correctly():
    """Ten events with one drift violation: exactly one failed event, aggregate fails."""
    outliers = [
        OutlierEvent(
            frame_idx=n * 10, geodesic_offset_m=400.0, src_jpeg_path=f"x{n}.jpg"
        )
        for n in range(1, 11)
    ]
    frames = [ev.frame_idx + delta for ev in outliers for delta in (-1, 0, 1)]
    gt: list[GtPose] = [_gt(frame, 50.0, 30.0) for frame in frames]

    # Every frame sits on ground truth except frame 31, the after-frame of the
    # event at frame 30, which is displaced in latitude (≈200 m per the
    # original author) and appended last.
    estimates: list[OutboundEstimate] = [
        _est(frame, 50.0, 30.0, cov=5.0) for frame in frames if frame != 31
    ]
    estimates.append(_est(31, 50.0018, 30.0, cov=5.0))

    result = evaluate(outliers, estimates, gt)

    assert result.total_outliers == 10
    assert result.failed_event_count == 1
    assert result.passes is False
# CSV evidence writer
def test_write_csv_evidence_round_trips(tmp_path: Path):
    """The evidence CSV holds one passing row per outlier event, in event order."""
    outliers = [
        OutlierEvent(frame_idx=10, geodesic_offset_m=412.5, src_jpeg_path="AD000011.jpg"),
        OutlierEvent(frame_idx=20, geodesic_offset_m=381.0, src_jpeg_path="AD000021.jpg"),
    ]
    frames = [ev.frame_idx + delta for ev in outliers for delta in (-1, 0, 1)]
    estimates: list[OutboundEstimate] = [
        _est(frame, 50.0, 30.0, cov=5.0) for frame in frames
    ]
    gt: list[GtPose] = [_gt(frame, 50.0, 30.0) for frame in frames]
    aggregate = evaluate(outliers, estimates, gt)

    written = write_csv_evidence(tmp_path / "ft_n_01_evidence.csv", aggregate)

    assert written.exists()
    with written.open() as handle:
        records = list(csv.DictReader(handle))
    assert [int(rec["frame_idx"]) for rec in records] == [10, 20]
    assert all(rec["passes"] == "true" for rec in records)
    assert all(rec["cov_non_decreasing"] == "true" for rec in records)
+6
View File
@@ -52,6 +52,9 @@ E2E_ROOT = Path(__file__).resolve().parents[1]
"runner/helpers/msp_frame_observer.py",
"runner/helpers/ap_contract_evaluator.py",
"runner/helpers/cold_start_evaluator.py",
"runner/helpers/outlier_tolerance_evaluator.py",
"runner/helpers/outage_request_evaluator.py",
"runner/helpers/blackout_spoof_evaluator.py",
"fixtures/mock-suite-sat/Dockerfile",
"fixtures/mock-suite-sat/app.py",
"fixtures/mock-suite-sat/requirements.txt",
@@ -96,7 +99,10 @@ E2E_ROOT = Path(__file__).resolve().parents[1]
"tests/positive/test_ft_p_09_inav.py",
"tests/positive/test_ft_p_10_smoothing_lookback.py",
"tests/positive/test_ft_p_11_cold_start_init.py",
"tests/negative/test_ft_n_01_outlier_tolerance.py",
"tests/negative/test_ft_n_02_sharp_turn_failure.py",
"tests/negative/test_ft_n_03_outage_reloc.py",
"tests/negative/test_ft_n_04_blackout_spoof.py",
],
)
def test_required_path_exists(relative_path: str) -> None: