[AZ-436] [AZ-437] [AZ-438] [AZ-439] Add NFT-SEC-01..05 security scenarios

Batch 87: 6 NFT-SEC blackbox scenarios + 5 helper evaluators + 75 unit tests + cumulative review of batches 85-87.

* AZ-436 NFT-SEC-01: cache-poisoning safety budget (AC-NEW-9); aggregate false_trust_count ≤ N×1e-6; zero-tolerance default. Canonical-only by default; E2E_NFT_SEC_01_RELEASE_GATE=1 unlocks the full matrix.
* AZ-437 NFT-SEC-02 + NFT-SEC-05: shared egress-observation evaluator (AC-NEW-10); SEC-02 = 0 packets to non-e2e-net over a 5 min replay; SEC-05 = DNS-blackhole sidecar healthy + lookup fails + UDP-53 silent.
* AZ-438 NFT-SEC-03: AP-only signing rejection (AC-NEW-11); 3 sub-cases (unsigned/wrong-key/replayed), each rejected within ≤500 ms + no position drift.
* AZ-439 NFT-SEC-04: probe (always-run) = no-crash + deterministic decode outcome; ASan-fuzz (release-gate) = 0 findings over ≥4 h; AC-3 corpus floor informational only per spec.

Verdict per-batch: PASS_WITH_WARNINGS (5 Low). Cumulative review for batches 85-87 (K=3 window) is also PASS_WITH_WARNINGS with 5 cross-batch findings — it recommends hygiene PBIs for the write_csv_evidence duplication (13 helpers) and the _resolve_fixture_path duplication (13 scenarios), plus new tickets for the AZ-595 fixture builder + DNS-blackhole sidecar service.

Also adds _docs/LESSONS.md documenting the Jira transition-ID lesson (always call getTransitionsForJiraIssue first; never memorize numeric IDs across sessions).

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 18:21:16 +00:00 · 2026-05-17 17:33:22 +03:00
parent de19e716d8
commit c56d4584e6
21 changed files with 3510 additions and 0 deletions
@@ -0,0 +1,169 @@
+"""AddressSanitizer fuzz evaluator for NFT-SEC-04 (AZ-439 / RESTRICT-CVE-1 release-gate).
+
+Companion to ``cve_probe_evaluator``: while the probe asserts a single
+crafted JPEG does not crash the SUT, the fuzz scenario runs the
+``build_kind=ASan`` SUT image under random JPEG inputs for ≥4 h and
+asserts:
+
+* AC-2: 0 ASan findings (``heap-buffer-overflow``, ``use-after-free``,
+  ``stack-buffer-overflow``, ``heap-use-after-free``, etc.) in the
+  captured stderr / ASan log;
+* AC-3 (informational only — no hard threshold): the harness reached
+  ≥``MIN_CORPUS_COVERAGE`` unique JPEG inputs.
+
+ASan-finding categories follow the canonical sanitizer wording. The
+classifier matches a curated, non-exhaustive set; an *unknown* match
+is bucketed into ``OTHER_FINDING`` and still fails AC-2. Unknown
+findings are surfaced in the CSV evidence so a regression triage knows
+to extend the canonical set.
+
+Public-boundary discipline: does NOT import any
+``src/gps_denied_onboard`` symbol.
+"""
+
+from __future__ import annotations
+
+import csv
+import re
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Sequence
+
+# Shortest fuzz run, in seconds (4 hours), that ``passes_duration`` accepts.
+MIN_FUZZ_DURATION_SECONDS = 4 * 3600  # AC-2 — release-gate minimum
+# Corpus-size floor reported by ``reached_corpus_floor``; never part of ``passes``.
+MIN_CORPUS_COVERAGE = 1000  # AC-3 — informational only
+
+
+class AsanFindingCategory(str, Enum):
+    """Finding categories produced by the classifier.
+
+    Each member's value is the label written to the CSV evidence.
+    ``OTHER_FINDING`` is the catch-all for a line that carries the generic
+    ``ERROR: AddressSanitizer:`` marker but matches no known pattern.
+    """
+
+    HEAP_BUFFER_OVERFLOW = "heap-buffer-overflow"
+    HEAP_USE_AFTER_FREE = "heap-use-after-free"
+    STACK_BUFFER_OVERFLOW = "stack-buffer-overflow"
+    STACK_USE_AFTER_RETURN = "stack-use-after-return"
+    GLOBAL_BUFFER_OVERFLOW = "global-buffer-overflow"
+    USE_AFTER_FREE = "use-after-free"
+    DOUBLE_FREE = "double-free"
+    OTHER_FINDING = "other-finding"  # canonical unknown ASan match
+
+
+# Each entry is (regex, category). Matched in order — first hit wins.
+#
+# ASan reports a double free as ``ERROR: AddressSanitizer: attempting
+# double-free on 0x…``, so the DOUBLE_FREE pattern tolerates the optional
+# ``attempting`` prefix. Without it a real double-free report would only hit
+# the generic marker below and be bucketed as OTHER_FINDING.
+_KNOWN_PATTERNS: tuple[tuple[str, AsanFindingCategory], ...] = (
+    (r"ERROR: AddressSanitizer:\s*heap-buffer-overflow", AsanFindingCategory.HEAP_BUFFER_OVERFLOW),
+    (r"ERROR: AddressSanitizer:\s*heap-use-after-free", AsanFindingCategory.HEAP_USE_AFTER_FREE),
+    (r"ERROR: AddressSanitizer:\s*stack-buffer-overflow", AsanFindingCategory.STACK_BUFFER_OVERFLOW),
+    (r"ERROR: AddressSanitizer:\s*stack-use-after-return", AsanFindingCategory.STACK_USE_AFTER_RETURN),
+    (r"ERROR: AddressSanitizer:\s*global-buffer-overflow", AsanFindingCategory.GLOBAL_BUFFER_OVERFLOW),
+    (r"ERROR: AddressSanitizer:\s*use-after-free", AsanFindingCategory.USE_AFTER_FREE),
+    (r"ERROR: AddressSanitizer:\s*(?:attempting\s+)?double-free", AsanFindingCategory.DOUBLE_FREE),
+)
+# Pre-compiled once at import so per-line classification does no regex parsing.
+_KNOWN_COMPILED = tuple((re.compile(pat), cat) for pat, cat in _KNOWN_PATTERNS)
+# Generic marker: any ASan error line, whether or not its kind is known above.
+_ANY_ASAN_RE = re.compile(r"ERROR: AddressSanitizer:")
+
+
+def classify_asan_line(line: str) -> AsanFindingCategory | None:
+    """Map a single log line to its finding category.
+
+    Known patterns are tried in declaration order and the first match wins.
+    A line that carries the generic ``ERROR: AddressSanitizer:`` marker but
+    matches no known pattern becomes ``OTHER_FINDING``; every other line
+    yields ``None``.
+    """
+    known = next(
+        (cat for pattern, cat in _KNOWN_COMPILED if pattern.search(line)),
+        None,
+    )
+    if known is not None:
+        return known
+    return AsanFindingCategory.OTHER_FINDING if _ANY_ASAN_RE.search(line) else None
+
+
+@dataclass(frozen=True)
+class AsanFinding:
+    """One classified finding (one line OR one synthesized event)."""
+
+    # Category assigned by the classifier.
+    category: AsanFindingCategory
+    # ``evaluate`` stores the matched line stripped and cut to 200 characters.
+    snippet: str  # the matched line; truncated to ≤200 chars in evidence
+
+
+@dataclass(frozen=True)
+class AsanFuzzReport:
+    """Outcome of a single fuzz run: duration, corpus size and findings.
+
+    ``passes`` requires both the minimum duration and zero findings. The
+    corpus floor is reported separately and never affects the verdict.
+    """
+
+    duration_seconds: float
+    corpus_size: int
+    findings: Sequence[AsanFinding]
+
+    @property
+    def passes_duration(self) -> bool:
+        """True when the run lasted at least ``MIN_FUZZ_DURATION_SECONDS``."""
+        return MIN_FUZZ_DURATION_SECONDS <= self.duration_seconds
+
+    @property
+    def passes_findings(self) -> bool:
+        """True when no finding was recorded."""
+        return not len(self.findings)
+
+    @property
+    def reached_corpus_floor(self) -> bool:
+        """True when the corpus reached ``MIN_CORPUS_COVERAGE`` inputs."""
+        # Evidence only: deliberately left out of ``passes``.
+        return MIN_CORPUS_COVERAGE <= self.corpus_size
+
+    @property
+    def passes(self) -> bool:
+        """Overall verdict: long enough AND finding-free."""
+        if not self.passes_duration:
+            return False
+        return self.passes_findings
+
+
+def evaluate(
+    asan_log_lines: Sequence[str],
+    *,
+    duration_seconds: float,
+    corpus_size: int,
+) -> AsanFuzzReport:
+    """Build the fuzz report from raw ASan log lines.
+
+    Every line is classified; lines that are not findings are dropped. Each
+    finding keeps the stripped source line, cut to 200 characters, as its
+    snippet. Findings are stored as a tuple in input order.
+    """
+    classified = ((classify_asan_line(raw), raw) for raw in asan_log_lines)
+    hits = tuple(
+        AsanFinding(category=kind, snippet=raw.strip()[:200])
+        for kind, raw in classified
+        if kind is not None
+    )
+    return AsanFuzzReport(
+        duration_seconds=duration_seconds,
+        corpus_size=corpus_size,
+        findings=hits,
+    )
+
+
+def write_csv_evidence(out_path: Path, report: AsanFuzzReport) -> Path:
+    """Write the fuzz verdict, plus one row per finding, as CSV evidence.
+
+    Layout: a header row and one summary row; when findings exist, a blank
+    row followed by a ``finding_index, category, snippet`` table. Parent
+    directories of ``out_path`` are created on demand.
+
+    The file is always written as UTF-8. Snippets are raw log lines and may
+    contain characters that a locale-dependent default encoding cannot
+    represent, which would abort the evidence write.
+
+    Returns ``out_path`` unchanged.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="", encoding="utf-8") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(
+            [
+                "duration_seconds",
+                "passes_duration",
+                "corpus_size",
+                "reached_corpus_floor",
+                "finding_count",
+                "passes_findings",
+                "passes",
+                "finding_breakdown",
+            ]
+        )
+        # Per-category tally, rendered as ``cat=count`` pairs sorted by name.
+        breakdown: dict[str, int] = {}
+        for f in report.findings:
+            breakdown[f.category.value] = breakdown.get(f.category.value, 0) + 1
+        breakdown_str = ";".join(
+            f"{cat}={count}" for cat, count in sorted(breakdown.items())
+        )
+        writer.writerow(
+            [
+                f"{report.duration_seconds:.0f}",
+                "true" if report.passes_duration else "false",
+                report.corpus_size,
+                "true" if report.reached_corpus_floor else "false",
+                len(report.findings),
+                "true" if report.passes_findings else "false",
+                "true" if report.passes else "false",
+                breakdown_str,
+            ]
+        )
+        if report.findings:
+            # Blank separator row, then the detail table for triage.
+            writer.writerow([])
+            writer.writerow(["finding_index", "category", "snippet"])
+            for idx, f in enumerate(report.findings):
+                writer.writerow([idx, f.category.value, f.snippet])
+    return out_path
@@ -0,0 +1,238 @@
+"""Cache-poisoning safety-budget evaluator for NFT-SEC-01 (AZ-436 / AC-NEW-9).
+
+The contract: across ``N`` synthetic flights — each carrying ``1-5 %`` of
+*poisoned* tiles (signing-cert mismatch, freshness violation, or voting
+disagreement) — the SUT MUST NOT silently emit a ``satellite_anchored``
+estimate that traces back to a poisoned tile (a *false-trust event*).
+
+Aggregate budget (Mode B Fact #103):
+
+    total_false_trust_events <= N * 1e-6
+
+At default CI N=1000 the budget is 0.001 expected events; the test
+therefore enforces the strict ``count == 0`` zero-tolerance default. The
+``release-gate`` N=10000 run keeps the same zero-tolerance default (the
+budget allows 0.01 events; one event is already a regression).
+
+Per-flight invariants additionally guarded by this evaluator:
+
+* poison ratio in ``[POISON_RATIO_MIN, POISON_RATIO_MAX]`` (AC-2);
+* at least one poisoned tile per **defense layer** in every flight
+  (AC-2 — each layer must be exercised so a per-layer regression cannot
+  hide behind layer-confusion);
+* per-event ``rejection_reason`` is one of the documented categories
+  (so an unlabeled false-positive cannot pass as a "rejection" in CSV
+  evidence).
+
+Public-boundary discipline: does NOT import any
+``src/gps_denied_onboard`` symbol. The evaluator only consumes
+runner-collected counter objects.
+"""
+
+from __future__ import annotations
+
+import csv
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Sequence
+
+# Per-flight share of the aggregate false-trust budget (budget = N * this).
+FALSE_TRUST_BUDGET_PER_FLIGHT = 1e-6
+# Inclusive bounds on the poisoned fraction of a flight's tile cache.
+POISON_RATIO_MIN = 0.01
+POISON_RATIO_MAX = 0.05
+
+# Defense-layer identifiers; each poisoned tile is tagged with one of them.
+DEFENSE_LAYER_SIGNING = "signing_cert_mismatch"
+DEFENSE_LAYER_FRESHNESS = "freshness_violation"
+DEFENSE_LAYER_VOTING = "voting_disagreement"
+
+# Layers that must each be exercised by at least one poisoned tile per flight.
+DEFENSE_LAYERS: tuple[str, ...] = (
+    DEFENSE_LAYER_SIGNING,
+    DEFENSE_LAYER_FRESHNESS,
+    DEFENSE_LAYER_VOTING,
+)
+
+# Accepted keys for ``FlightOutcome.rejection_reasons``. A flight with any
+# other key is flagged as using an undocumented rejection reason.
+REJECTION_REASONS: frozenset[str] = frozenset(
+    {
+        DEFENSE_LAYER_SIGNING,
+        DEFENSE_LAYER_FRESHNESS,
+        DEFENSE_LAYER_VOTING,
+        "freshness_gate_downgrade",
+        "service_voting_downgrade",
+    }
+)
+
+
+@dataclass(frozen=True)
+class PoisonedTileSpec:
+    """One poisoned tile slotted into a flight's tile cache."""
+
+    # Identifier of the crafted tile.
+    tile_id: str
+    # Layer the tile targets. Not validated here: an unlisted value is
+    # accepted and simply does not count toward layer coverage.
+    defense_layer: str  # MUST be one of ``DEFENSE_LAYERS``
+
+
+@dataclass(frozen=True)
+class FalseTrustEvent:
+    """A frame where the SUT emitted ``satellite_anchored`` traced to a poisoned tile."""
+
+    # Flight in which the event was observed.
+    flight_id: str
+    # Poisoned tile the estimate traced back to.
+    tile_id: str
+    # NOTE(review): presumably the frame's monotonic-clock timestamp in ms — confirm.
+    monotonic_ms: int
+    # NOTE(review): presumably the defense layer of the offending tile — confirm.
+    defense_layer: str
+
+
+@dataclass(frozen=True)
+class FlightOutcome:
+    """Result of one synthetic flight.
+
+    * ``total_tile_count`` — the flight's cache size; denominator of the
+      poison ratio.
+    * ``poisoned_tiles`` — the crafted tiles injected into that cache.
+    * ``false_trust_events`` — runner-observed frames where the SUT trusted
+      a poisoned tile.
+    * ``rejection_reasons`` — how often each rejection reason fired, taken
+      from the runner's outbound ``source_label`` capture plus FDR
+      signing-rejection events. Only the *keys* are checked (against the
+      documented vocabulary); the counts are evidence. A flight with no
+      rejection events is suspicious but not necessarily a failure, since
+      the SUT may have downgraded the candidate without naming the cause.
+    """
+
+    flight_id: str
+    total_tile_count: int
+    poisoned_tiles: Sequence[PoisonedTileSpec]
+    false_trust_events: Sequence[FalseTrustEvent]
+    rejection_reasons: dict[str, int] = field(default_factory=dict)
+
+    @property
+    def poison_ratio(self) -> float:
+        """Poisoned tiles / cache size; ``0.0`` for a non-positive cache size."""
+        if self.total_tile_count > 0:
+            return len(self.poisoned_tiles) / self.total_tile_count
+        return 0.0
+
+    @property
+    def defense_layers_present(self) -> set[str]:
+        """Distinct defense layers covered by this flight's poisoned tiles."""
+        return set(tile.defense_layer for tile in self.poisoned_tiles)
+
+    @property
+    def false_trust_count(self) -> int:
+        """Number of false-trust events observed in this flight."""
+        return len(self.false_trust_events)
+
+    def has_unknown_rejection_reasons(self) -> bool:
+        """True when any recorded reason is outside ``REJECTION_REASONS``."""
+        return not REJECTION_REASONS.issuperset(self.rejection_reasons)
+
+
+@dataclass(frozen=True)
+class CachePoisoningReport:
+    """Aggregate verdict over all flights of one run.
+
+    Holds the per-flight outcomes together with the lists of flight ids that
+    violated each invariant; the ``passes_*`` properties turn those lists
+    into booleans and ``passes`` combines all four.
+    """
+
+    flights: Sequence[FlightOutcome]
+    flight_count: int
+    total_false_trust: int
+    flights_with_bad_poison_ratio: Sequence[str]
+    flights_missing_defense_layers: Sequence[tuple[str, list[str]]]
+    flights_with_unknown_rejection_reasons: Sequence[str]
+
+    @property
+    def budget(self) -> float:
+        """Allowed expected false-trust events for this many flights."""
+        return FALSE_TRUST_BUDGET_PER_FLIGHT * self.flight_count
+
+    @property
+    def passes_budget(self) -> bool:
+        """True only when not a single false-trust event was observed."""
+        # Zero tolerance on purpose: at N=1000 the budget is 0.001 expected
+        # events, so even one observed event already exceeds it.
+        return not self.total_false_trust != 0
+
+    @property
+    def passes_ratio(self) -> bool:
+        """True when every flight's poison ratio was inside the bounds."""
+        return not len(self.flights_with_bad_poison_ratio)
+
+    @property
+    def passes_layer_coverage(self) -> bool:
+        """True when every flight exercised every defense layer."""
+        return not len(self.flights_missing_defense_layers)
+
+    @property
+    def passes_rejection_reason_vocabulary(self) -> bool:
+        """True when no flight reported an undocumented rejection reason."""
+        return not len(self.flights_with_unknown_rejection_reasons)
+
+    @property
+    def passes(self) -> bool:
+        """Overall verdict: all four individual checks hold."""
+        if not self.passes_budget:
+            return False
+        if not self.passes_ratio:
+            return False
+        if not self.passes_layer_coverage:
+            return False
+        return self.passes_rejection_reason_vocabulary
+
+
+def evaluate(flights: Sequence[FlightOutcome]) -> CachePoisoningReport:
+    """Aggregate per-flight outcomes into the run-level verdict.
+
+    For each flight this sums its false-trust events and records the flight
+    id when its poison ratio is out of bounds, when a defense layer is
+    missing (with the sorted missing layers), or when it reported a
+    rejection reason outside the documented vocabulary.
+    """
+    required_layers = set(DEFENSE_LAYERS)
+    ratio_offenders: list[str] = []
+    layer_gaps: list[tuple[str, list[str]]] = []
+    vocabulary_offenders: list[str] = []
+    false_trust_total = 0
+    for outcome in flights:
+        false_trust_total += outcome.false_trust_count
+        if not (POISON_RATIO_MIN <= outcome.poison_ratio <= POISON_RATIO_MAX):
+            ratio_offenders.append(outcome.flight_id)
+        absent = sorted(required_layers - outcome.defense_layers_present)
+        if absent:
+            layer_gaps.append((outcome.flight_id, absent))
+        if outcome.has_unknown_rejection_reasons():
+            vocabulary_offenders.append(outcome.flight_id)
+    return CachePoisoningReport(
+        flights=tuple(flights),
+        flight_count=len(flights),
+        total_false_trust=false_trust_total,
+        flights_with_bad_poison_ratio=tuple(ratio_offenders),
+        flights_missing_defense_layers=tuple(layer_gaps),
+        flights_with_unknown_rejection_reasons=tuple(vocabulary_offenders),
+    )
+
+
+def write_csv_evidence(out_path: Path, report: CachePoisoningReport) -> Path:
+    """Per-flight CSV — one row per flight + an aggregate footer row.
+
+    The footer is separated from the flight rows by a blank row and holds
+    ``key=value`` cells. It carries every individual verdict, including the
+    rejection-reason vocabulary check, and the overall ``passes``, so the
+    evidence alone explains why a run failed. Those two cells are appended
+    after the pre-existing ones, so earlier cell positions are unchanged.
+
+    The file is written as UTF-8 so output does not depend on the host
+    locale. Parent directories are created on demand. Returns ``out_path``.
+    """
+
+    def _flag(ok: bool) -> str:
+        # Lower-case literal used for every boolean cell in the evidence.
+        return "true" if ok else "false"
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="", encoding="utf-8") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(
+            [
+                "flight_id",
+                "total_tile_count",
+                "poisoned_tile_count",
+                "poison_ratio",
+                "defense_layers_present",
+                "false_trust_count",
+                "rejection_reason_breakdown",
+            ]
+        )
+        for flight in report.flights:
+            # "<none>" keeps the cell non-empty when no layer was exercised.
+            layers_present = ",".join(sorted(flight.defense_layers_present)) or "<none>"
+            breakdown = ";".join(
+                f"{reason}={count}"
+                for reason, count in sorted(flight.rejection_reasons.items())
+            )
+            writer.writerow(
+                [
+                    flight.flight_id,
+                    flight.total_tile_count,
+                    len(flight.poisoned_tiles),
+                    f"{flight.poison_ratio:.4f}",
+                    layers_present,
+                    flight.false_trust_count,
+                    breakdown,
+                ]
+            )
+        writer.writerow([])
+        writer.writerow(
+            [
+                "AGGREGATE",
+                f"flight_count={report.flight_count}",
+                f"total_false_trust={report.total_false_trust}",
+                f"budget={report.budget:g}",
+                f"passes_budget={_flag(report.passes_budget)}",
+                f"passes_ratio={_flag(report.passes_ratio)}",
+                f"passes_layer_coverage={_flag(report.passes_layer_coverage)}",
+                f"passes_rejection_reason_vocabulary={_flag(report.passes_rejection_reason_vocabulary)}",
+                f"passes={_flag(report.passes)}",
+            ]
+        )
+    return out_path
@@ -0,0 +1,143 @@
+"""OpenCV CVE-2025-53644 probe evaluator for NFT-SEC-04 (AZ-439 / RESTRICT-CVE-1).
+
+The probe feeds the canonical ``cve-jpeg-fixture`` (a single crafted
+JPEG known to trigger the upstream OpenCV vulnerability) into the SUT's
+nav-camera input and asserts:
+
+* AC-1a: the SUT process remains **alive** after the frame (no crash,
+  no abort, no segfault — the FDR archive contains AT LEAST one record
+  whose monotonic_ms strictly post-dates the probe);
+* AC-1b: the FDR contains a deterministic outcome for the probe frame
+  — either a ``decode-success`` record (the patched OpenCV decoded
+  the crafted JPEG) OR a ``frame-decode-error`` record (the SUT
+  gracefully rejected the malformed payload). A silent drop (no FDR
+  record at all) is a failure, because it means the SUT swallowed the
+  frame without honoring its error-reporting contract.
+
+Public-boundary discipline: does NOT import any
+``src/gps_denied_onboard`` symbol. The evaluator consumes only the
+runner-collected ``FdrSurvivalRecord`` summaries the FDR-reader helper
+already produces for other scenarios.
+"""
+
+from __future__ import annotations
+
+import csv
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Sequence
+
+
+class ProbeFrameOutcome(str, Enum):
+    """The deterministic per-frame outcome the SUT must record."""
+
+    # The SUT decoded the crafted JPEG.
+    DECODE_SUCCESS = "decode-success"
+    # The SUT rejected the payload and recorded a decode error.
+    FRAME_DECODE_ERROR = "frame-decode-error"
+    # No classifiable record was found inside the probe window.
+    MISSING = "missing"  # silent drop — fails AC-1b
+
+
+@dataclass(frozen=True)
+class FdrSurvivalRecord:
+    """One FDR record from the runner-collected archive."""
+
+    # Record timestamp; compared against the probe injection time.
+    monotonic_ms: int
+    # Record type. Classification uses substring tests ("decode-success" /
+    # "decode-error"), so a prefixed kind such as "frame-decode-success" matches.
+    kind: str  # e.g. "frame-decode-success" or "frame-decode-error"
+
+
+@dataclass(frozen=True)
+class CveProbeReport:
+    """Verdict for a single ``cve-jpeg-fixture`` injection.
+
+    Combines the liveness check (an FDR record at or after the probe) with
+    the outcome check (the probe frame produced a recorded decode result).
+    """
+
+    probe_injected_at_ms: int
+    last_fdr_record_at_ms: int | None
+    probe_outcome: ProbeFrameOutcome
+
+    @property
+    def passes_no_crash(self) -> bool:
+        """True when the newest FDR record is not older than the probe."""
+        newest = self.last_fdr_record_at_ms
+        if newest is None:
+            # An empty archive gives no evidence that the SUT survived.
+            return False
+        # NOTE(review): the module docstring asks for a record that
+        # *strictly* post-dates the probe, whereas this comparison also
+        # accepts an equal timestamp — confirm which one is intended.
+        return newest >= self.probe_injected_at_ms
+
+    @property
+    def passes_graceful_outcome(self) -> bool:
+        """True when the probe frame was either decoded or explicitly rejected."""
+        acceptable = (
+            ProbeFrameOutcome.DECODE_SUCCESS,
+            ProbeFrameOutcome.FRAME_DECODE_ERROR,
+        )
+        return self.probe_outcome in acceptable
+
+    @property
+    def passes(self) -> bool:
+        """Overall verdict: survived AND recorded a deterministic outcome."""
+        if not self.passes_no_crash:
+            return False
+        return self.passes_graceful_outcome
+
+
+def classify_probe_outcome(
+    fdr_records: Sequence[FdrSurvivalRecord],
+    *,
+    probe_injected_at_ms: int,
+    tolerance_ms: int = 50,
+) -> ProbeFrameOutcome:
+    """Pick the FDR record nearest the probe injection and classify it.
+
+    A record is considered ``for the probe`` if its monotonic timestamp
+    lies within ``[probe_injected_at_ms, probe_injected_at_ms + tolerance_ms]``
+    (both ends inclusive) and its ``kind`` contains ``decode-success`` or
+    ``decode-error``. Among those candidates the one with the smallest
+    timestamp wins, and the earliest in sequence order wins on a tie, so the
+    result does not depend on the archive being chronologically sorted.
+
+    If no record qualifies the outcome is ``MISSING`` — which fails AC-1b
+    regardless of the no-crash check.
+    """
+    window_end = probe_injected_at_ms + tolerance_ms
+    nearest_at: int | None = None
+    nearest_outcome = ProbeFrameOutcome.MISSING
+    for record in fdr_records:
+        if not (probe_injected_at_ms <= record.monotonic_ms <= window_end):
+            continue
+        if "decode-success" in record.kind:
+            outcome = ProbeFrameOutcome.DECODE_SUCCESS
+        elif "decode-error" in record.kind:
+            outcome = ProbeFrameOutcome.FRAME_DECODE_ERROR
+        else:
+            # In-window record of an unrelated kind: not a probe outcome.
+            continue
+        # Strict "<" keeps the first candidate on equal timestamps, which
+        # matches the previous first-hit behaviour for sorted input.
+        if nearest_at is None or record.monotonic_ms < nearest_at:
+            nearest_at = record.monotonic_ms
+            nearest_outcome = outcome
+    return nearest_outcome
+
+
+def evaluate(
+    fdr_records: Sequence[FdrSurvivalRecord],
+    *,
+    probe_injected_at_ms: int,
+    tolerance_ms: int = 50,
+) -> CveProbeReport:
+    """Assemble the probe report from the collected FDR records.
+
+    Records the newest timestamp in the archive (``None`` when it is empty)
+    and the classified outcome of the probe frame.
+    """
+    timestamps = [rec.monotonic_ms for rec in fdr_records]
+    newest = max(timestamps) if timestamps else None
+    return CveProbeReport(
+        probe_injected_at_ms=probe_injected_at_ms,
+        last_fdr_record_at_ms=newest,
+        probe_outcome=classify_probe_outcome(
+            fdr_records,
+            probe_injected_at_ms=probe_injected_at_ms,
+            tolerance_ms=tolerance_ms,
+        ),
+    )
+
+
+def write_csv_evidence(out_path: Path, report: CveProbeReport) -> Path:
+    """Write the probe verdict as a two-row CSV (header + values).
+
+    Parent directories are created on demand; a missing last-record
+    timestamp is written as an empty cell. Returns ``out_path``.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    header = (
+        "probe_injected_at_ms",
+        "last_fdr_record_at_ms",
+        "probe_outcome",
+        "passes_no_crash",
+        "passes_graceful_outcome",
+        "passes",
+    )
+    with out_path.open("w", newline="") as fh:
+        sink = csv.writer(fh)
+        sink.writerow(header)
+        newest = report.last_fdr_record_at_ms
+        verdicts = (
+            report.passes_no_crash,
+            report.passes_graceful_outcome,
+            report.passes,
+        )
+        sink.writerow(
+            [
+                report.probe_injected_at_ms,
+                newest if newest is not None else "",
+                report.probe_outcome.value,
+                *("true" if ok else "false" for ok in verdicts),
+            ]
+        )
+    return out_path
@@ -0,0 +1,219 @@
+"""Egress-observation evaluator shared by NFT-SEC-02 and NFT-SEC-05 (AZ-437).
+
+Both scenarios verify the same invariant: **no packets leave the e2e
+Docker network** from the SUT container. NFT-SEC-02 reads the Docker
+network-stats counter over a 5-min Derkachi replay. NFT-SEC-05 runs a
+``nslookup`` probe inside the SUT container's network namespace and
+checks that (a) the lookup fails and (b) no UDP-53 packets escape the
+host's outbound interface during the probe.
+
+The observation pattern is identical in both cases: take a *before*
+counter snapshot, run the workload, take an *after* snapshot, assert
+``after - before == 0`` for the relevant counter family. The runner is
+responsible for the actual ``docker network inspect`` / ``ip -s link``
+collection; this helper only performs the delta + verdict logic so the
+scenario code stays tight and the verdict logic is unit-testable in
+isolation.
+
+DNS-resolution outcome categories follow the spec's wording (NXDOMAIN,
+timeout, "no servers can be reached") + a generic "other failure" bucket
+for resolver implementations that emit a different string but still
+fail. A *success* outcome — i.e. an actual A record returned — is the
+only failing case.
+
+Public-boundary discipline: does NOT import any
+``src/gps_denied_onboard`` symbol.
+"""
+
+from __future__ import annotations
+
+import csv
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+
+
+class DnsLookupOutcome(str, Enum):
+    """The runner-classified outcome of a ``nslookup`` probe."""
+
+    # The four members below are all ways in which the lookup did NOT resolve.
+    NXDOMAIN = "nxdomain"
+    TIMEOUT = "timeout"
+    NO_SERVERS = "no_servers_can_be_reached"
+    # Catch-all for resolvers that fail with some other message.
+    OTHER_FAILURE = "other_failure"
+    # The lookup resolved, i.e. DNS escaped the blackhole.
+    SUCCESS = "success"  # the only outcome that fails the AC
+
+
+# Outcomes in which the *lookup itself* failed. Despite the name, these are
+# the outcomes that SATISFY the check: ``passes_lookup`` is true exactly when
+# the observed outcome is a member of this set.
+FAILING_DNS_OUTCOMES: frozenset[DnsLookupOutcome] = frozenset(
+    {
+        DnsLookupOutcome.NXDOMAIN,
+        DnsLookupOutcome.TIMEOUT,
+        DnsLookupOutcome.NO_SERVERS,
+        DnsLookupOutcome.OTHER_FAILURE,
+    }
+)
+
+
+@dataclass(frozen=True)
+class EgressCounterSnapshot:
+    """Point-in-time reading of the three egress packet counters.
+
+    Construction fails with ``ValueError`` if any counter is negative.
+    """
+
+    egress_packets_to_internal_net: int
+    egress_packets_to_other_destinations: int
+    udp53_egress_packets: int
+
+    def __post_init__(self) -> None:
+        # Checked in declaration order, so the first negative counter is the
+        # one named in the error.
+        counter_names = (
+            "egress_packets_to_internal_net",
+            "egress_packets_to_other_destinations",
+            "udp53_egress_packets",
+        )
+        for name in counter_names:
+            reading = getattr(self, name)
+            if reading < 0:
+                message = f"egress counter {name!r} cannot be negative; got {reading}"
+                raise ValueError(message)
+
+
+@dataclass(frozen=True)
+class NoEgressReport:
+    """NFT-SEC-02 verdict built from a before/after counter pair.
+
+    The check passes when the count of packets to non-internal destinations
+    did not change across the window. The internal-net delta is exposed as
+    well but does not influence ``passes``.
+    """
+
+    before: EgressCounterSnapshot
+    after: EgressCounterSnapshot
+    window_label: str  # e.g. "5min-derkachi-replay"
+
+    @property
+    def delta_other_destinations(self) -> int:
+        """Change in packets to non-internal destinations (after − before)."""
+        end = self.after.egress_packets_to_other_destinations
+        start = self.before.egress_packets_to_other_destinations
+        return end - start
+
+    @property
+    def delta_internal(self) -> int:
+        """Change in packets to the internal network (after − before)."""
+        end = self.after.egress_packets_to_internal_net
+        start = self.before.egress_packets_to_internal_net
+        return end - start
+
+    @property
+    def passes(self) -> bool:
+        """True when the non-internal counter did not move at all."""
+        return 0 == self.delta_other_destinations
+
+
+@dataclass(frozen=True)
+class DnsBlackholeReport:
+    """NFT-SEC-05 verdict: sidecar healthy, lookup failed, no UDP-53 egress."""
+
+    before: EgressCounterSnapshot
+    after: EgressCounterSnapshot
+    lookup_outcome: DnsLookupOutcome
+    sidecar_healthy: bool
+
+    @property
+    def delta_udp53(self) -> int:
+        """Change in the UDP-53 egress counter (after − before)."""
+        end = self.after.udp53_egress_packets
+        start = self.before.udp53_egress_packets
+        return end - start
+
+    @property
+    def passes_lookup(self) -> bool:
+        """True when the lookup ended in one of the failure outcomes."""
+        return self.lookup_outcome in FAILING_DNS_OUTCOMES
+
+    @property
+    def passes_udp_silence(self) -> bool:
+        """True when the UDP-53 counter did not move during the probe."""
+        return 0 == self.delta_udp53
+
+    @property
+    def passes(self) -> bool:
+        """Overall verdict; the sidecar must be healthy for it to count."""
+        healthy_and_blocked = self.sidecar_healthy and self.passes_lookup
+        return healthy_and_blocked and self.passes_udp_silence
+
+
+def evaluate_no_egress(
+    before: EgressCounterSnapshot,
+    after: EgressCounterSnapshot,
+    *,
+    window_label: str,
+) -> NoEgressReport:
+    """Wrap the two snapshots and the window label into the NFT-SEC-02 report.
+
+    No arithmetic happens here; the report's properties derive the deltas
+    and the verdict on access.
+    """
+    report = NoEgressReport(
+        window_label=window_label,
+        after=after,
+        before=before,
+    )
+    return report
+
+
+def evaluate_dns_blackhole(
+    before: EgressCounterSnapshot,
+    after: EgressCounterSnapshot,
+    *,
+    lookup_outcome: DnsLookupOutcome,
+    sidecar_healthy: bool,
+) -> DnsBlackholeReport:
+    """Bundle the snapshots, lookup outcome and sidecar health into the NFT-SEC-05 report.
+
+    The verdict itself is computed lazily by the report's properties.
+    """
+    fields = {
+        "before": before,
+        "after": after,
+        "lookup_outcome": lookup_outcome,
+        "sidecar_healthy": sidecar_healthy,
+    }
+    return DnsBlackholeReport(**fields)
+
+
+def write_no_egress_csv_evidence(out_path: Path, report: NoEgressReport) -> Path:
+    """Write the NFT-SEC-02 verdict as a two-row CSV (header + values).
+
+    Parent directories are created on demand. Returns ``out_path``.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    header = (
+        "window_label",
+        "before_other",
+        "after_other",
+        "delta_other",
+        "before_internal",
+        "after_internal",
+        "delta_internal",
+        "passes",
+    )
+    with out_path.open("w", newline="") as fh:
+        sink = csv.writer(fh)
+        sink.writerow(header)
+        start, end = report.before, report.after
+        verdict = "true" if report.passes else "false"
+        sink.writerow(
+            [
+                report.window_label,
+                start.egress_packets_to_other_destinations,
+                end.egress_packets_to_other_destinations,
+                report.delta_other_destinations,
+                start.egress_packets_to_internal_net,
+                end.egress_packets_to_internal_net,
+                report.delta_internal,
+                verdict,
+            ]
+        )
+    return out_path
+
+
+def write_dns_blackhole_csv_evidence(
+    out_path: Path, report: DnsBlackholeReport
+) -> Path:
+    """Write the NFT-SEC-05 verdict as a two-row CSV (header + values).
+
+    Parent directories are created on demand. Returns ``out_path``.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    header = (
+        "sidecar_healthy",
+        "lookup_outcome",
+        "passes_lookup",
+        "before_udp53",
+        "after_udp53",
+        "delta_udp53",
+        "passes_udp_silence",
+        "passes",
+    )
+
+    def as_text(flag: bool) -> str:
+        # Lower-case literal used for every boolean cell.
+        return "true" if flag else "false"
+
+    with out_path.open("w", newline="") as fh:
+        sink = csv.writer(fh)
+        sink.writerow(header)
+        sink.writerow(
+            [
+                as_text(report.sidecar_healthy),
+                report.lookup_outcome.value,
+                as_text(report.passes_lookup),
+                report.before.udp53_egress_packets,
+                report.after.udp53_egress_packets,
+                report.delta_udp53,
+                as_text(report.passes_udp_silence),
+                as_text(report.passes),
+            ]
+        )
+    return out_path
@@ -0,0 +1,217 @@
+"""MAVLink 2.0 signing-rejection evaluator for NFT-SEC-03 (AZ-438 / AC-NEW-11, D-C8-9).
+
+For each of the three injection sub-cases — unsigned, signed-with-wrong-key,
+replayed-from-tlog — AP MUST:
+
+* emit a ``BAD_SIGNATURE`` STATUSTEXT within ``REJECTION_LATENCY_MS``
+  (500 ms) of the injected message;
+* NOT update its ``GLOBAL_POSITION_INT`` from the injected message
+  (i.e. the GPS position remains anchored to whatever the last legitimate
+  emission established).
+
+The "rejection STATUSTEXT" regex matches the canonical AP wording (e.g.
+``MAVLink: BAD_SIGNATURE``) plus an "equivalent" wildcard that the
+spec carves out for AP variants that emit a slightly different phrase
+("Bad signature received", "signature rejected", etc.) so this evaluator
+does not lock to one exact build.
+
+Public-boundary discipline: does NOT import any
+``src/gps_denied_onboard`` symbol.
+"""
+
+from __future__ import annotations
+
+import csv
+import re
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Sequence
+
+# Default budget (ms) between an injection and the matching rejection
+# STATUSTEXT; used as ``SubCaseRejectionReport.budget_ms``.
+REJECTION_LATENCY_MS = 500
+# Largest position drift (m) still treated as "no position update".
+POSITION_DRIFT_TOLERANCE_M = 1.0
+
+
+class SubCase(str, Enum):
+    """Identifier of an injection sub-case; members are also ``str`` values."""
+
+    # Message injected without a signature.
+    UNSIGNED = "unsigned"
+    # Message signed with a key other than the expected one.
+    WRONG_KEY = "wrong_key"
+    # Previously captured message replayed from a tlog.
+    REPLAYED = "replayed"
+
+
+# Canonical wording + variants observed across AP builds. Matched
+# case-insensitively. Extending this set is a deliberate decision; the
+# regression risk is that a future AP build emits a brand-new phrase
+# and the runner silently treats injections as accepted — guard against
+# that by surfacing the seen STATUSTEXTs in the CSV evidence row.
+# NOTE(review): ``write_csv_evidence`` in this module records only the
+# matched ``rejection_text``; confirm that unmatched STATUSTEXTs are
+# surfaced somewhere, otherwise a new phrase shows up as an empty column.
+BAD_SIGNATURE_PATTERNS: tuple[str, ...] = (
+    r"\bBAD[_\s]?SIGNATURE\b",
+    r"\bsignature\s+rejected\b",
+    r"\bbad\s+signature\s+received\b",
+)
+# All patterns joined into one alternation, compiled once at import time.
+_BAD_SIGNATURE_RE = re.compile("|".join(BAD_SIGNATURE_PATTERNS), re.IGNORECASE)
+
+
+def is_bad_signature_statustext(text: str) -> bool:
+    """Report whether ``text`` contains any documented signature-rejection phrase.
+
+    The match is a case-insensitive search anywhere in ``text``, not a
+    full-string match.
+    """
+    return _BAD_SIGNATURE_RE.search(text) is not None
+
+
+@dataclass(frozen=True)
+class InjectionEvent:
+    """One runner-issued injection in a sub-case."""
+
+    # Which of the three injection variants this event represents.
+    sub_case: SubCase
+    # Injection timestamp in ms; compared directly against
+    # ``StatustextSample.monotonic_ms`` and ``PositionSample.monotonic_ms``,
+    # so it presumably shares their clock — TODO confirm with the runner.
+    injected_at_ms: int
+
+
+@dataclass(frozen=True)
+class StatustextSample:
+    """One captured STATUSTEXT line with its capture timestamp."""
+
+    # Capture timestamp in ms (same scale as ``InjectionEvent.injected_at_ms``).
+    monotonic_ms: int
+    # Raw STATUSTEXT text, matched against the bad-signature patterns.
+    text: str
+
+
+@dataclass(frozen=True)
+class PositionSample:
+    """AP ``GLOBAL_POSITION_INT`` sample with raw 1e7-scaled lat/lon.
+
+    The values are NOT in meters: ``position_drift_m`` multiplies them by
+    1e-7 and then converts the difference to meters.
+    """
+
+    # Capture timestamp in ms.
+    monotonic_ms: int
+    # Latitude / longitude as integers scaled by 1e7.
+    lat_e7: int
+    lon_e7: int
+
+
+def position_drift_m(samples: Sequence[PositionSample], around_ms: int) -> float:
+    """Equirectangular drift (m) between the last sample before and first after ``around_ms``.
+
+    "Before" is the latest sample with ``monotonic_ms <= around_ms`` and
+    "after" is the earliest sample with ``monotonic_ms > around_ms``. Both
+    are selected by timestamp, so ``samples`` does not have to be sorted;
+    for sorted input the result is the same as a first-match scan.
+
+    A tiny budget (1 m by default) tolerates the per-frame jitter the
+    autopilot's own EKF produces; the absolute test is that the drift
+    is NOT on the order of the injected message's lat/lon magnitude
+    (which would be several-meters to kilometers).
+
+    Returns ``0.0`` when no sample exists on one side of ``around_ms``.
+    NOTE(review): a missing capture is therefore indistinguishable from
+    "no drift"; callers that must fail closed need to check sample
+    coverage themselves.
+    """
+    # Function-scope import: ``math`` is not in the module's import header.
+    import math
+
+    meters_per_degree = 111_320.0
+    before: PositionSample | None = None
+    after: PositionSample | None = None
+    for s in samples:
+        if s.monotonic_ms <= around_ms:
+            # Keep the latest sample at or before the injection.
+            if before is None or s.monotonic_ms > before.monotonic_ms:
+                before = s
+        elif after is None or s.monotonic_ms < after.monotonic_ms:
+            # Keep the earliest sample after the injection (no early break,
+            # so list order does not influence the choice).
+            after = s
+    if before is None or after is None:
+        return 0.0
+    dlat_m = (after.lat_e7 - before.lat_e7) * 1e-7 * meters_per_degree
+    # Longitude degrees shrink with cos(latitude); use the mean latitude.
+    avg_lat_rad = math.radians((after.lat_e7 + before.lat_e7) / 2.0 * 1e-7)
+    dlon_m = (
+        (after.lon_e7 - before.lon_e7) * 1e-7 * meters_per_degree * math.cos(avg_lat_rad)
+    )
+    return math.hypot(dlat_m, dlon_m)
+
+
+@dataclass(frozen=True)
+class SubCaseRejectionReport:
+    """Verdict for a single injection sub-case (AC-2 / AC-3 / AC-4).
+
+    Two independent checks are combined: a rejection STATUSTEXT was seen
+    within ``budget_ms``, and the position drift stayed within
+    ``POSITION_DRIFT_TOLERANCE_M``.
+    """
+
+    sub_case: SubCase
+    # Timestamp and text of the matched rejection; ``None`` when none was found.
+    rejection_at_ms: int | None
+    rejection_text: str | None
+    # Delay from injection to rejection in ms; ``None`` when none was found.
+    rejection_latency_ms: int | None
+    position_drift_m: float
+    budget_ms: int = REJECTION_LATENCY_MS
+
+    @property
+    def passes_rejection(self) -> bool:
+        """True when a rejection was recorded and arrived within ``budget_ms``."""
+        if self.rejection_at_ms is None:
+            return False
+        if self.rejection_latency_ms is None:
+            return False
+        return self.rejection_latency_ms <= self.budget_ms
+
+    @property
+    def passes_no_position_update(self) -> bool:
+        """True when the measured drift does not exceed the tolerance."""
+        return not (self.position_drift_m > POSITION_DRIFT_TOLERANCE_M) and (
+            self.position_drift_m <= POSITION_DRIFT_TOLERANCE_M
+        )
+
+    @property
+    def passes(self) -> bool:
+        """True only when both the rejection and the position check pass."""
+        if not self.passes_rejection:
+            return False
+        return self.passes_no_position_update
+
+
+@dataclass(frozen=True)
+class SigningRejectionReport:
+    """Aggregate AC-2 + AC-3 + AC-4 verdict across all sub-cases.
+
+    An empty ``sub_cases`` sequence yields ``passes == True`` (nothing
+    failed), so callers should make sure the expected sub-cases are present.
+    """
+
+    sub_cases: Sequence[SubCaseRejectionReport]
+
+    @property
+    def passes(self) -> bool:
+        """True when no sub-case report has a falsy ``passes``."""
+        for verdict in self.sub_cases:
+            if not verdict.passes:
+                return False
+        return True
+
+
+def evaluate_subcase(
+    injection: InjectionEvent,
+    statustexts: Sequence[StatustextSample],
+    positions: Sequence[PositionSample],
+) -> SubCaseRejectionReport:
+    """Build the verdict for one (injection, capture) pair.
+
+    The rejection is the first entry of ``statustexts``, in sequence
+    order, that is timestamped at or after the injection and matches a
+    bad-signature phrase. When none qualifies, the three rejection fields
+    of the report are ``None``. Position drift is measured around the
+    injection timestamp.
+    """
+    start_ms = injection.injected_at_ms
+    hit = next(
+        (
+            sample
+            for sample in statustexts
+            if sample.monotonic_ms >= start_ms
+            and is_bad_signature_statustext(sample.text)
+        ),
+        None,
+    )
+    if hit is None:
+        seen_at = seen_text = latency = None
+    else:
+        seen_at = hit.monotonic_ms
+        seen_text = hit.text
+        latency = hit.monotonic_ms - start_ms
+    return SubCaseRejectionReport(
+        sub_case=injection.sub_case,
+        rejection_at_ms=seen_at,
+        rejection_text=seen_text,
+        rejection_latency_ms=latency,
+        position_drift_m=position_drift_m(positions, start_ms),
+    )
+
+
+def evaluate(
+    injections: Sequence[InjectionEvent],
+    *,
+    statustexts: Sequence[StatustextSample],
+    positions: Sequence[PositionSample],
+) -> SigningRejectionReport:
+    """Evaluate every injection against the same captures.
+
+    Each injection is scored independently by ``evaluate_subcase``; the
+    per-injection reports are collected, in input order, into a tuple on
+    the returned aggregate report.
+    """
+    return SigningRejectionReport(
+        sub_cases=tuple(
+            evaluate_subcase(event, statustexts=statustexts, positions=positions)
+            for event in injections
+        )
+    )
+
+
+def write_csv_evidence(out_path: Path, report: SigningRejectionReport) -> Path:
+    """Write one CSV row per sub-case verdict and return ``out_path``.
+
+    Missing parent directories are created and an existing file is
+    overwritten. The file is always encoded as UTF-8: ``rejection_text``
+    is free-form autopilot text, so relying on the platform default
+    encoding would make the evidence bytes locale-dependent and could
+    raise ``UnicodeEncodeError`` on non-ASCII text.
+
+    Column conventions: absent rejection fields are written as empty
+    cells (a legitimate ``0`` is kept), drift is formatted with four
+    decimals, and booleans are ``"true"`` / ``"false"``.
+    """
+
+    def _flag(value: object) -> str:
+        return "true" if value else "false"
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="", encoding="utf-8") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(
+            [
+                "sub_case",
+                "rejection_at_ms",
+                "rejection_latency_ms",
+                "rejection_text",
+                "position_drift_m",
+                "passes_rejection",
+                "passes_no_position_update",
+                "passes",
+            ]
+        )
+        for sc in report.sub_cases:
+            writer.writerow(
+                [
+                    sc.sub_case.value,
+                    # ``is None`` rather than truthiness so 0 ms is preserved.
+                    "" if sc.rejection_at_ms is None else sc.rejection_at_ms,
+                    "" if sc.rejection_latency_ms is None else sc.rejection_latency_ms,
+                    sc.rejection_text or "",
+                    f"{sc.position_drift_m:.4f}",
+                    _flag(sc.passes_rejection),
+                    _flag(sc.passes_no_position_update),
+                    _flag(sc.passes),
+                ]
+            )
+    return out_path