[AZ-436] [AZ-437] [AZ-438] [AZ-439] Add NFT-SEC-01..05 security scenarios

Batch 87: 6 NFT-SEC blackbox scenarios + 5 helper evaluators + 75 unit tests + cumulative review batches 85-87. * AZ-436 NFT-SEC-01: cache-poisoning safety budget (AC-NEW-9); aggregate false_trust_count ≤ N×1e-6; zero-tolerance default. Canonical-only by default; E2E_NFT_SEC_01_RELEASE_GATE=1 unlocks full matrix. * AZ-437 NFT-SEC-02 + NFT-SEC-05: shared egress-observation evaluator (AC-NEW-10); SEC-02 = 0 packets to non-e2e-net over 5min replay; SEC-05 = DNS-blackhole sidecar healthy + lookup fails + UDP-53 silent. * AZ-438 NFT-SEC-03: AP-only signing rejection (AC-NEW-11); 3 sub-cases (unsigned/wrong-key/replayed) each reject ≤500ms + no position drift. * AZ-439 NFT-SEC-04: probe (always-run) = no-crash + deterministic decode outcome; ASan-fuzz (release-gate) = 0 findings ≥4h; AC-3 corpus floor informational only per spec. Verdict per-batch: PASS_WITH_WARNINGS (5 Low). Cumulative review for batches 85-87 (K=3 window) also PASS_WITH_WARNINGS with 5 cross-batch findings — recommends hygiene PBIs for write_csv_evidence duplication (13 helpers) and _resolve_fixture_path duplication (13 scenarios), plus new tickets for AZ-595 fixture builder + DNS-blackhole sidecar service. Also adds _docs/LESSONS.md documenting the Jira transition-ID lesson (always call getTransitionsForJiraIssue first, never memorize numeric IDs across sessions). Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 19:51:13 +00:00 · 2026-05-17 17:33:22 +03:00
parent de19e716d8
commit c56d4584e6
21 changed files with 3510 additions and 0 deletions
@@ -0,0 +1,176 @@
+"""Unit tests for ``runner.helpers.asan_fuzz_evaluator`` (NFT-SEC-04 / AZ-439 fuzz)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import asan_fuzz_evaluator as afe
+
+
+@pytest.mark.parametrize(
+    "line,expected",
+    [
+        (
+            "=================================================================\n",
+            None,
+        ),
+        ("==1234==ERROR: AddressSanitizer: heap-buffer-overflow on address ...", afe.AsanFindingCategory.HEAP_BUFFER_OVERFLOW),
+        ("==1234==ERROR: AddressSanitizer: heap-use-after-free on address ...", afe.AsanFindingCategory.HEAP_USE_AFTER_FREE),
+        ("==1234==ERROR: AddressSanitizer: stack-buffer-overflow on address ...", afe.AsanFindingCategory.STACK_BUFFER_OVERFLOW),
+        ("==1234==ERROR: AddressSanitizer: stack-use-after-return on ...", afe.AsanFindingCategory.STACK_USE_AFTER_RETURN),
+        ("==1234==ERROR: AddressSanitizer: global-buffer-overflow on ...", afe.AsanFindingCategory.GLOBAL_BUFFER_OVERFLOW),
+        ("==1234==ERROR: AddressSanitizer: use-after-free on ...", afe.AsanFindingCategory.USE_AFTER_FREE),
+        ("==1234==ERROR: AddressSanitizer: double-free on ...", afe.AsanFindingCategory.DOUBLE_FREE),
+        # A new ASan category we haven't catalogued yet — must still
+        # fail the test by classifying as OTHER_FINDING.
+        (
+            "==1234==ERROR: AddressSanitizer: mysterious-future-category on ...",
+            afe.AsanFindingCategory.OTHER_FINDING,
+        ),
+        ("just a normal log line, harmless", None),
+        ("ERROR but no AddressSanitizer prefix", None),
+    ],
+)
+def test_classify_asan_line(
+    line: str, expected: afe.AsanFindingCategory | None
+) -> None:
+    assert afe.classify_asan_line(line) == expected
+
+
+def test_zero_findings_and_full_duration_passes() -> None:
+    report = afe.evaluate(
+        ["info line", "another info line"],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    assert report.passes_findings
+    assert report.passes_duration
+    assert report.passes
+
+
+def test_short_duration_fails_even_with_zero_findings() -> None:
+    report = afe.evaluate(
+        [],
+        duration_seconds=60,  # 1 minute
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    assert report.passes_findings
+    assert not report.passes_duration
+    assert not report.passes
+
+
+def test_one_finding_fails_full_run() -> None:
+    report = afe.evaluate(
+        ["==1==ERROR: AddressSanitizer: heap-buffer-overflow on ..."],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    assert len(report.findings) == 1
+    assert report.findings[0].category is afe.AsanFindingCategory.HEAP_BUFFER_OVERFLOW
+    assert not report.passes_findings
+    assert not report.passes
+
+
+def test_unknown_asan_finding_still_fails() -> None:
+    report = afe.evaluate(
+        ["==1==ERROR: AddressSanitizer: brand-new-category"],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    assert report.findings[0].category is afe.AsanFindingCategory.OTHER_FINDING
+    assert not report.passes
+
+
+def test_corpus_floor_is_informational_only() -> None:
+    # 0 findings + full duration but well below corpus floor → still passes.
+    report = afe.evaluate(
+        [],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=10,
+    )
+
+    assert not report.reached_corpus_floor
+    assert report.passes
+
+
+def test_snippet_is_truncated_for_evidence() -> None:
+    huge = "==1==ERROR: AddressSanitizer: heap-buffer-overflow " + "x" * 500
+    report = afe.evaluate(
+        [huge],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    assert len(report.findings[0].snippet) <= 200
+
+
+def test_multiple_findings_classified_and_counted() -> None:
+    log_lines = [
+        "info",
+        "==1==ERROR: AddressSanitizer: heap-buffer-overflow",
+        "info",
+        "==2==ERROR: AddressSanitizer: heap-buffer-overflow",
+        "==3==ERROR: AddressSanitizer: use-after-free",
+        "trailing log",
+    ]
+
+    report = afe.evaluate(
+        log_lines,
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    assert len(report.findings) == 3
+    categories = [f.category for f in report.findings]
+    assert categories.count(afe.AsanFindingCategory.HEAP_BUFFER_OVERFLOW) == 2
+    assert categories.count(afe.AsanFindingCategory.USE_AFTER_FREE) == 1
+    assert not report.passes
+
+
+def test_csv_evidence_round_trip_no_findings(tmp_path: Path) -> None:
+    report = afe.evaluate(
+        [],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    out = tmp_path / "out.csv"
+    afe.write_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "duration_seconds"
+    assert rows[1][5] == "true"  # passes_findings
+    assert rows[1][6] == "true"  # passes
+    # No per-finding section because no findings were recorded.
+    assert len(rows) == 2
+
+
+def test_csv_evidence_round_trip_with_findings(tmp_path: Path) -> None:
+    report = afe.evaluate(
+        [
+            "==1==ERROR: AddressSanitizer: heap-buffer-overflow",
+            "==2==ERROR: AddressSanitizer: use-after-free",
+        ],
+        duration_seconds=afe.MIN_FUZZ_DURATION_SECONDS,
+        corpus_size=afe.MIN_CORPUS_COVERAGE,
+    )
+
+    out = tmp_path / "out.csv"
+    afe.write_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "duration_seconds"
+    assert rows[1][6] == "false"  # passes
+    # Header + aggregate row + blank + sub-header + 2 finding rows = 6 rows total.
+    assert rows[2] == []
+    assert rows[3] == ["finding_index", "category", "snippet"]
+    assert len(rows) == 6
@@ -0,0 +1,245 @@
+"""Unit tests for ``runner.helpers.cache_poisoning_evaluator`` (NFT-SEC-01 / AZ-436)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import cache_poisoning_evaluator as cpe
+
+
+def _spec(tile_id: str, layer: str) -> cpe.PoisonedTileSpec:
+    return cpe.PoisonedTileSpec(tile_id=tile_id, defense_layer=layer)
+
+
+def _flight(
+    flight_id: str,
+    *,
+    total: int = 1000,
+    poisoned: tuple[cpe.PoisonedTileSpec, ...] | None = None,
+    false_trust: tuple[cpe.FalseTrustEvent, ...] | None = None,
+    rejection_reasons: dict[str, int] | None = None,
+) -> cpe.FlightOutcome:
+    if poisoned is None:
+        poisoned = (
+            _spec("t1", cpe.DEFENSE_LAYER_SIGNING),
+            _spec("t2", cpe.DEFENSE_LAYER_FRESHNESS),
+            _spec("t3", cpe.DEFENSE_LAYER_VOTING),
+        )
+    return cpe.FlightOutcome(
+        flight_id=flight_id,
+        total_tile_count=total,
+        poisoned_tiles=poisoned,
+        false_trust_events=false_trust or (),
+        rejection_reasons=rejection_reasons or {},
+    )
+
+
+def test_poison_ratio_within_band_passes_ratio_check() -> None:
+    # 3 poisoned / 100 total = 3 % — inside [1 %, 5 %].
+    flight = _flight("f1", total=100)
+
+    report = cpe.evaluate([flight])
+
+    assert report.passes_ratio
+    assert flight.poison_ratio == pytest.approx(0.03)
+
+
+def test_poison_ratio_below_min_fails_ratio_check() -> None:
+    # 3 / 1000 = 0.3 % — below the 1 % floor.
+    flight = _flight("f-low", total=1000)
+
+    report = cpe.evaluate([flight])
+
+    assert not report.passes_ratio
+    assert "f-low" in report.flights_with_bad_poison_ratio
+
+
+def test_poison_ratio_above_max_fails_ratio_check() -> None:
+    # 3 / 50 = 6 % — above the 5 % ceiling.
+    flight = _flight("f-high", total=50)
+
+    report = cpe.evaluate([flight])
+
+    assert not report.passes_ratio
+    assert "f-high" in report.flights_with_bad_poison_ratio
+
+
+def test_zero_total_tile_count_gives_zero_ratio() -> None:
+    flight = _flight("f-empty", total=0, poisoned=())
+
+    report = cpe.evaluate([flight])
+
+    assert flight.poison_ratio == 0.0
+    assert "f-empty" in report.flights_with_bad_poison_ratio
+
+
+def test_missing_defense_layer_fails_layer_coverage() -> None:
+    # Only signing + freshness; voting layer missing.
+    flight = _flight(
+        "f-missing",
+        total=100,
+        poisoned=(
+            _spec("t1", cpe.DEFENSE_LAYER_SIGNING),
+            _spec("t2", cpe.DEFENSE_LAYER_FRESHNESS),
+        ),
+    )
+
+    report = cpe.evaluate([flight])
+
+    assert not report.passes_layer_coverage
+    [(flight_id, missing)] = report.flights_missing_defense_layers
+    assert flight_id == "f-missing"
+    assert missing == [cpe.DEFENSE_LAYER_VOTING]
+
+
+def test_all_three_defense_layers_pass_layer_coverage() -> None:
+    flight = _flight("f-complete", total=100)
+
+    report = cpe.evaluate([flight])
+
+    assert report.passes_layer_coverage
+
+
+def test_zero_false_trust_events_passes_budget() -> None:
+    flight = _flight("f", total=100)
+
+    report = cpe.evaluate([flight])
+
+    assert report.total_false_trust == 0
+    assert report.passes_budget
+
+
+def test_single_false_trust_event_fails_budget() -> None:
+    flight = _flight(
+        "f",
+        total=100,
+        false_trust=(
+            cpe.FalseTrustEvent(
+                flight_id="f",
+                tile_id="t1",
+                monotonic_ms=12345,
+                defense_layer=cpe.DEFENSE_LAYER_SIGNING,
+            ),
+        ),
+    )
+
+    report = cpe.evaluate([flight])
+
+    assert report.total_false_trust == 1
+    assert not report.passes_budget
+    # Zero-tolerance default — the aggregate budget is N×1e-6 (only 0.01
+    # even at N=10000), so a single event always exceeds it and must fail.
+
+
+def test_unknown_rejection_reason_fails_vocabulary_check() -> None:
+    flight = _flight(
+        "f-vocab",
+        total=100,
+        rejection_reasons={"made_up_reason": 7},
+    )
+
+    report = cpe.evaluate([flight])
+
+    assert not report.passes_rejection_reason_vocabulary
+    assert "f-vocab" in report.flights_with_unknown_rejection_reasons
+
+
+def test_known_rejection_reasons_pass_vocabulary_check() -> None:
+    flight = _flight(
+        "f-ok",
+        total=100,
+        rejection_reasons={
+            cpe.DEFENSE_LAYER_SIGNING: 12,
+            cpe.DEFENSE_LAYER_VOTING: 8,
+            "freshness_gate_downgrade": 3,
+        },
+    )
+
+    report = cpe.evaluate([flight])
+
+    assert report.passes_rejection_reason_vocabulary
+
+
+def test_aggregate_budget_scales_with_flight_count() -> None:
+    flights = [_flight(f"f{i}", total=100) for i in range(50)]
+
+    report = cpe.evaluate(flights)
+
+    assert report.flight_count == 50
+    assert report.budget == pytest.approx(50 * 1e-6)
+
+
+def test_aggregate_counts_false_trust_across_flights() -> None:
+    flights = [
+        _flight(
+            f"f{i}",
+            total=100,
+            false_trust=(
+                cpe.FalseTrustEvent(
+                    flight_id=f"f{i}",
+                    tile_id="tx",
+                    monotonic_ms=100 * i,
+                    defense_layer=cpe.DEFENSE_LAYER_SIGNING,
+                ),
+            )
+            if i % 2 == 0
+            else (),
+        )
+        for i in range(4)
+    ]
+
+    report = cpe.evaluate(flights)
+
+    assert report.total_false_trust == 2  # f0 and f2 each had one event
+
+
+def test_overall_pass_requires_all_subchecks() -> None:
+    flight = _flight("f", total=100)
+
+    report = cpe.evaluate([flight])
+
+    assert report.passes
+    assert report.passes_budget
+    assert report.passes_ratio
+    assert report.passes_layer_coverage
+    assert report.passes_rejection_reason_vocabulary
+
+
+def test_overall_pass_fails_if_any_subcheck_fails() -> None:
+    flight = _flight(
+        "f-broken",
+        total=1000,  # 3/1000 = 0.3 % — bad ratio
+    )
+
+    report = cpe.evaluate([flight])
+
+    assert not report.passes
+
+
+def test_empty_flight_list_gives_trivial_pass() -> None:
+    # Treat an empty run as a no-op rather than an implicit failure;
+    # the scenario test is responsible for asserting N >= 1.
+    report = cpe.evaluate([])
+
+    assert report.flight_count == 0
+    assert report.total_false_trust == 0
+    assert report.passes
+
+
+def test_csv_evidence_has_header_per_flight_and_aggregate_rows(tmp_path: Path) -> None:
+    flights = [_flight(f"f{i}", total=100) for i in range(3)]
+    report = cpe.evaluate(flights)
+
+    out = tmp_path / "out.csv"
+    cpe.write_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "flight_id"
+    assert {rows[1][0], rows[2][0], rows[3][0]} == {"f0", "f1", "f2"}
+    assert rows[4] == []
+    assert rows[5][0] == "AGGREGATE"
+    assert any("flight_count=3" in cell for cell in rows[5])
@@ -0,0 +1,120 @@
+"""Unit tests for ``runner.helpers.cve_probe_evaluator`` (NFT-SEC-04 / AZ-439 probe)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+from runner.helpers import cve_probe_evaluator as cpe
+
+
+def _rec(ms: int, kind: str) -> cpe.FdrSurvivalRecord:
+    return cpe.FdrSurvivalRecord(monotonic_ms=ms, kind=kind)
+
+
+def test_decode_success_in_window_is_classified_as_decode_success() -> None:
+    outcome = cpe.classify_probe_outcome(
+        [_rec(10_005, "frame-decode-success")],
+        probe_injected_at_ms=10_000,
+    )
+    assert outcome is cpe.ProbeFrameOutcome.DECODE_SUCCESS
+
+
+def test_decode_error_in_window_is_classified_as_frame_decode_error() -> None:
+    outcome = cpe.classify_probe_outcome(
+        [_rec(10_010, "frame-decode-error")],
+        probe_injected_at_ms=10_000,
+    )
+    assert outcome is cpe.ProbeFrameOutcome.FRAME_DECODE_ERROR
+
+
+def test_no_record_in_window_is_missing() -> None:
+    outcome = cpe.classify_probe_outcome(
+        [_rec(9_900, "frame-decode-success")],
+        probe_injected_at_ms=10_000,
+    )
+    assert outcome is cpe.ProbeFrameOutcome.MISSING
+
+
+def test_record_outside_tolerance_is_missing() -> None:
+    outcome = cpe.classify_probe_outcome(
+        [_rec(10_100, "frame-decode-success")],
+        probe_injected_at_ms=10_000,
+        tolerance_ms=50,
+    )
+    assert outcome is cpe.ProbeFrameOutcome.MISSING
+
+
+def test_first_match_in_window_wins() -> None:
+    outcome = cpe.classify_probe_outcome(
+        [
+            _rec(10_005, "frame-decode-success"),
+            _rec(10_010, "frame-decode-error"),
+        ],
+        probe_injected_at_ms=10_000,
+    )
+    assert outcome is cpe.ProbeFrameOutcome.DECODE_SUCCESS
+
+
+def test_passes_when_no_crash_and_decode_success() -> None:
+    report = cpe.evaluate(
+        [_rec(10_005, "frame-decode-success"), _rec(11_000, "imu-tick")],
+        probe_injected_at_ms=10_000,
+    )
+    assert report.passes_no_crash
+    assert report.passes_graceful_outcome
+    assert report.passes
+
+
+def test_passes_when_no_crash_and_graceful_error() -> None:
+    report = cpe.evaluate(
+        [_rec(10_005, "frame-decode-error"), _rec(11_000, "imu-tick")],
+        probe_injected_at_ms=10_000,
+    )
+    assert report.passes
+
+
+def test_fails_when_no_post_probe_fdr_record() -> None:
+    # All FDR records are BEFORE the probe — the SUT crashed at probe
+    # time. AC-1a fails.
+    report = cpe.evaluate(
+        [_rec(9_500, "imu-tick"), _rec(9_900, "frame-decode-success")],
+        probe_injected_at_ms=10_000,
+    )
+    assert not report.passes_no_crash
+    assert not report.passes
+
+
+def test_fails_when_silent_drop() -> None:
+    # SUT is alive (post-probe records exist) but no decode record at
+    # all — the probe frame was silently swallowed. AC-1b fails.
+    report = cpe.evaluate(
+        [_rec(11_000, "imu-tick"), _rec(12_000, "imu-tick")],
+        probe_injected_at_ms=10_000,
+    )
+    assert report.passes_no_crash
+    assert not report.passes_graceful_outcome
+    assert not report.passes
+
+
+def test_empty_fdr_archive_fails_both_subchecks() -> None:
+    report = cpe.evaluate([], probe_injected_at_ms=10_000)
+    assert not report.passes_no_crash
+    assert not report.passes_graceful_outcome
+    assert not report.passes
+
+
+def test_csv_evidence_round_trip(tmp_path: Path) -> None:
+    report = cpe.evaluate(
+        [_rec(10_005, "frame-decode-success")],
+        probe_injected_at_ms=10_000,
+    )
+
+    out = tmp_path / "out.csv"
+    cpe.write_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "probe_injected_at_ms"
+    assert rows[1][2] == "decode-success"
+    assert rows[1][-1] == "true"
@@ -0,0 +1,168 @@
+"""Unit tests for ``runner.helpers.egress_observer`` (NFT-SEC-02 + NFT-SEC-05 / AZ-437)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import egress_observer as eo
+
+
+def _snap(other: int = 0, internal: int = 0, udp53: int = 0) -> eo.EgressCounterSnapshot:
+    return eo.EgressCounterSnapshot(
+        egress_packets_to_internal_net=internal,
+        egress_packets_to_other_destinations=other,
+        udp53_egress_packets=udp53,
+    )
+
+
+def test_egress_counter_rejects_negative_values() -> None:
+    with pytest.raises(ValueError, match="cannot be negative"):
+        eo.EgressCounterSnapshot(
+            egress_packets_to_internal_net=-1,
+            egress_packets_to_other_destinations=0,
+            udp53_egress_packets=0,
+        )
+
+
+def test_no_egress_zero_delta_passes() -> None:
+    before = _snap(other=10, internal=5)
+    after = _snap(other=10, internal=42)  # internal traffic grew; that's fine
+
+    report = eo.evaluate_no_egress(before, after, window_label="5min")
+
+    assert report.delta_other_destinations == 0
+    assert report.passes
+
+
+def test_no_egress_nonzero_delta_fails() -> None:
+    before = _snap(other=10)
+    after = _snap(other=11)
+
+    report = eo.evaluate_no_egress(before, after, window_label="5min")
+
+    assert report.delta_other_destinations == 1
+    assert not report.passes
+
+
+def test_no_egress_records_internal_delta_for_evidence() -> None:
+    before = _snap(internal=100)
+    after = _snap(internal=200)
+
+    report = eo.evaluate_no_egress(before, after, window_label="5min-derkachi")
+
+    assert report.delta_internal == 100  # informational; does not affect verdict
+    assert report.passes
+
+
+def test_dns_blackhole_passes_on_full_silence_and_failed_lookup() -> None:
+    before = _snap(udp53=7)
+    after = _snap(udp53=7)
+
+    report = eo.evaluate_dns_blackhole(
+        before,
+        after,
+        lookup_outcome=eo.DnsLookupOutcome.NXDOMAIN,
+        sidecar_healthy=True,
+    )
+
+    assert report.passes
+
+
+def test_dns_blackhole_fails_on_successful_lookup() -> None:
+    before = _snap(udp53=7)
+    after = _snap(udp53=7)
+
+    report = eo.evaluate_dns_blackhole(
+        before,
+        after,
+        lookup_outcome=eo.DnsLookupOutcome.SUCCESS,
+        sidecar_healthy=True,
+    )
+
+    assert not report.passes_lookup
+    assert not report.passes
+
+
+def test_dns_blackhole_fails_when_udp53_packets_escaped() -> None:
+    before = _snap(udp53=7)
+    after = _snap(udp53=8)
+
+    report = eo.evaluate_dns_blackhole(
+        before,
+        after,
+        lookup_outcome=eo.DnsLookupOutcome.NXDOMAIN,
+        sidecar_healthy=True,
+    )
+
+    assert not report.passes_udp_silence
+    assert not report.passes
+
+
+def test_dns_blackhole_fails_when_sidecar_unhealthy() -> None:
+    before = _snap()
+    after = _snap()
+
+    report = eo.evaluate_dns_blackhole(
+        before,
+        after,
+        lookup_outcome=eo.DnsLookupOutcome.NXDOMAIN,
+        sidecar_healthy=False,
+    )
+
+    assert not report.passes
+
+
+@pytest.mark.parametrize(
+    "outcome",
+    [
+        eo.DnsLookupOutcome.NXDOMAIN,
+        eo.DnsLookupOutcome.TIMEOUT,
+        eo.DnsLookupOutcome.NO_SERVERS,
+        eo.DnsLookupOutcome.OTHER_FAILURE,
+    ],
+)
+def test_all_failure_outcomes_pass_lookup_check(outcome: eo.DnsLookupOutcome) -> None:
+    report = eo.evaluate_dns_blackhole(
+        _snap(),
+        _snap(),
+        lookup_outcome=outcome,
+        sidecar_healthy=True,
+    )
+
+    assert report.passes_lookup
+
+
+def test_no_egress_csv_evidence_round_trip(tmp_path: Path) -> None:
+    before = _snap(other=0, internal=5)
+    after = _snap(other=0, internal=42)
+    report = eo.evaluate_no_egress(before, after, window_label="5min")
+
+    out = tmp_path / "out.csv"
+    eo.write_no_egress_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "window_label"
+    assert rows[1][0] == "5min"
+    assert rows[1][-1] == "true"
+
+
+def test_dns_blackhole_csv_evidence_round_trip(tmp_path: Path) -> None:
+    report = eo.evaluate_dns_blackhole(
+        _snap(udp53=7),
+        _snap(udp53=7),
+        lookup_outcome=eo.DnsLookupOutcome.NXDOMAIN,
+        sidecar_healthy=True,
+    )
+
+    out = tmp_path / "out.csv"
+    eo.write_dns_blackhole_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "sidecar_healthy"
+    assert rows[1][1] == "nxdomain"
+    assert rows[1][-1] == "true"
@@ -0,0 +1,196 @@
+"""Unit tests for ``runner.helpers.mavlink_signing_evaluator`` (NFT-SEC-03 / AZ-438)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import mavlink_signing_evaluator as mse
+
+
+def _pos(ms: int, lat_e7: int = 0, lon_e7: int = 0) -> mse.PositionSample:
+    return mse.PositionSample(monotonic_ms=ms, lat_e7=lat_e7, lon_e7=lon_e7)
+
+
+def _st(ms: int, text: str) -> mse.StatustextSample:
+    return mse.StatustextSample(monotonic_ms=ms, text=text)
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        ("MAVLink: BAD_SIGNATURE", True),
+        ("BAD SIGNATURE", True),
+        ("Bad signature received from sysid=255", True),
+        ("Signature rejected on link 0", True),
+        ("PreArm: GPS Glitch", False),
+        ("OK", False),
+        ("", False),
+    ],
+)
+def test_is_bad_signature_statustext_matches_documented_variants(
+    text: str, expected: bool
+) -> None:
+    assert mse.is_bad_signature_statustext(text) is expected
+
+
+def test_subcase_passes_when_rejection_arrives_within_budget() -> None:
+    injection = mse.InjectionEvent(
+        sub_case=mse.SubCase.UNSIGNED, injected_at_ms=10_000
+    )
+    statustexts = [_st(10_300, "MAVLink: BAD_SIGNATURE")]
+    positions = [_pos(9_900), _pos(10_100)]
+
+    report = mse.evaluate_subcase(injection, statustexts, positions)
+
+    assert report.rejection_latency_ms == 300
+    assert report.passes_rejection
+    assert report.passes
+
+
+def test_subcase_fails_when_no_rejection_seen() -> None:
+    injection = mse.InjectionEvent(
+        sub_case=mse.SubCase.WRONG_KEY, injected_at_ms=10_000
+    )
+    statustexts = [_st(10_300, "ok normal text")]
+    positions = [_pos(9_900), _pos(10_100)]
+
+    report = mse.evaluate_subcase(injection, statustexts, positions)
+
+    assert report.rejection_at_ms is None
+    assert not report.passes_rejection
+    assert not report.passes
+
+
+def test_subcase_fails_when_rejection_too_slow() -> None:
+    injection = mse.InjectionEvent(
+        sub_case=mse.SubCase.REPLAYED, injected_at_ms=10_000
+    )
+    statustexts = [_st(11_000, "MAVLink: BAD_SIGNATURE")]
+    positions = [_pos(9_900), _pos(10_100)]
+
+    report = mse.evaluate_subcase(injection, statustexts, positions)
+
+    assert report.rejection_latency_ms == 1000
+    assert not report.passes_rejection
+    assert not report.passes
+
+
+def test_subcase_fails_when_position_drifts() -> None:
+    injection = mse.InjectionEvent(
+        sub_case=mse.SubCase.UNSIGNED, injected_at_ms=10_000
+    )
+    statustexts = [_st(10_200, "MAVLink: BAD_SIGNATURE")]
+    # 0.0001 deg ≈ 11 m of latitude drift — clearly outside the 1 m
+    # tolerance, simulating a successful poison of AP's GPS state.
+    positions = [_pos(9_900, lat_e7=0), _pos(10_100, lat_e7=1_000)]
+
+    report = mse.evaluate_subcase(injection, statustexts, positions)
+
+    assert report.position_drift_m > mse.POSITION_DRIFT_TOLERANCE_M
+    assert not report.passes_no_position_update
+    assert not report.passes
+
+
+def test_subcase_passes_with_tiny_jitter_within_tolerance() -> None:
+    injection = mse.InjectionEvent(
+        sub_case=mse.SubCase.UNSIGNED, injected_at_ms=10_000
+    )
+    statustexts = [_st(10_100, "MAVLink: BAD_SIGNATURE")]
+    # 10 e7-lat units ≈ 11 cm — well below the 1 m tolerance.
+    positions = [_pos(9_900, lat_e7=0), _pos(10_100, lat_e7=10)]
+
+    report = mse.evaluate_subcase(injection, statustexts, positions)
+
+    assert report.position_drift_m < mse.POSITION_DRIFT_TOLERANCE_M
+    assert report.passes
+
+
+def test_position_drift_returns_zero_when_no_pre_injection_sample() -> None:
+    # Only samples after the injection — the helper has no baseline
+    # so it returns 0 (the AC assertion still passes; the test author
+    # is expected to fail the test earlier if positions are missing).
+    drift = mse.position_drift_m([_pos(10_100)], around_ms=10_000)
+
+    assert drift == 0.0
+
+
+def test_position_drift_returns_zero_when_no_post_injection_sample() -> None:
+    drift = mse.position_drift_m([_pos(9_900)], around_ms=10_000)
+
+    assert drift == 0.0
+
+
+def test_rejection_before_injection_is_ignored() -> None:
+    injection = mse.InjectionEvent(
+        sub_case=mse.SubCase.WRONG_KEY, injected_at_ms=10_000
+    )
+    statustexts = [
+        _st(9_500, "MAVLink: BAD_SIGNATURE"),  # earlier — ignored
+        _st(10_400, "ok"),
+    ]
+    positions = [_pos(9_900), _pos(10_100)]
+
+    report = mse.evaluate_subcase(injection, statustexts, positions)
+
+    assert report.rejection_at_ms is None
+    assert not report.passes
+
+
+def test_aggregate_passes_only_if_all_subcases_pass() -> None:
+    injections = [
+        mse.InjectionEvent(mse.SubCase.UNSIGNED, injected_at_ms=10_000),
+        mse.InjectionEvent(mse.SubCase.WRONG_KEY, injected_at_ms=20_000),
+        mse.InjectionEvent(mse.SubCase.REPLAYED, injected_at_ms=30_000),
+    ]
+    statustexts = [
+        _st(10_100, "MAVLink: BAD_SIGNATURE"),
+        _st(20_200, "Signature rejected"),
+        _st(30_300, "Bad signature received"),
+    ]
+    positions = [_pos(9_900), _pos(40_100)]
+
+    report = mse.evaluate(
+        injections, statustexts=statustexts, positions=positions
+    )
+
+    assert report.passes
+
+
+def test_aggregate_fails_when_one_subcase_fails() -> None:
+    injections = [
+        mse.InjectionEvent(mse.SubCase.UNSIGNED, injected_at_ms=10_000),
+        mse.InjectionEvent(mse.SubCase.WRONG_KEY, injected_at_ms=20_000),  # no rejection
+    ]
+    statustexts = [_st(10_100, "MAVLink: BAD_SIGNATURE")]
+    positions = [_pos(9_900), _pos(40_100)]
+
+    report = mse.evaluate(
+        injections, statustexts=statustexts, positions=positions
+    )
+
+    assert not report.passes
+    [unsigned, wrong_key] = report.sub_cases
+    assert unsigned.passes
+    assert not wrong_key.passes
+
+
+def test_csv_evidence_round_trip(tmp_path: Path) -> None:
+    injection = mse.InjectionEvent(mse.SubCase.UNSIGNED, injected_at_ms=10_000)
+    statustexts = [_st(10_200, "MAVLink: BAD_SIGNATURE")]
+    positions = [_pos(9_900), _pos(10_100, lat_e7=10)]
+    report = mse.evaluate(
+        [injection], statustexts=statustexts, positions=positions
+    )
+
+    out = tmp_path / "out.csv"
+    mse.write_csv_evidence(out, report)
+
+    with out.open() as fh:
+        rows = list(csv.reader(fh))
+    assert rows[0][0] == "sub_case"
+    assert rows[1][0] == "unsigned"
+    assert rows[1][2] == "200"  # latency
+    assert rows[1][-1] == "true"