Files
Oleksandr Bezdieniezhnykh c56d4584e6 [AZ-436] [AZ-437] [AZ-438] [AZ-439] Add NFT-SEC-01..05 security scenarios
Batch 87: 6 NFT-SEC blackbox scenarios + 5 helper evaluators + 75 unit
tests + cumulative review batches 85-87.

* AZ-436 NFT-SEC-01: cache-poisoning safety budget (AC-NEW-9); aggregate
  false_trust_count ≤ N×1e-6; zero-tolerance default. Canonical-only by
  default; E2E_NFT_SEC_01_RELEASE_GATE=1 unlocks full matrix.
* AZ-437 NFT-SEC-02 + NFT-SEC-05: shared egress-observation evaluator
  (AC-NEW-10); SEC-02 = 0 packets to non-e2e-net over 5min replay;
  SEC-05 = DNS-blackhole sidecar healthy + lookup fails + UDP-53 silent.
* AZ-438 NFT-SEC-03: AP-only signing rejection (AC-NEW-11); 3 sub-cases
  (unsigned/wrong-key/replayed) each reject ≤500ms + no position drift.
* AZ-439 NFT-SEC-04: probe (always-run) = no-crash + deterministic
  decode outcome; ASan-fuzz (release-gate) = 0 findings ≥4h; AC-3
  corpus floor informational only per spec.

Verdict per-batch: PASS_WITH_WARNINGS (5 Low). Cumulative review for
batches 85-87 (K=3 window) also PASS_WITH_WARNINGS with 5 cross-batch
findings — recommends hygiene PBIs for write_csv_evidence duplication
(13 helpers) and _resolve_fixture_path duplication (13 scenarios), plus
new tickets for AZ-595 fixture builder + DNS-blackhole sidecar service.

Also adds _docs/LESSONS.md documenting the Jira transition-ID lesson
(always call getTransitionsForJiraIssue first, never memorize numeric
IDs across sessions).

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-17 17:33:22 +03:00

239 lines
8.1 KiB
Python

"""Cache-poisoning safety-budget evaluator for NFT-SEC-01 (AZ-436 / AC-NEW-9).

The contract: across ``N`` synthetic flights — each carrying ``1-5 %`` of
*poisoned* tiles (signing-cert mismatch, freshness violation, or voting
disagreement) — the SUT MUST NOT silently emit a ``satellite_anchored``
estimate that traces back to a poisoned tile (a *false-trust event*).

Aggregate budget (Mode B Fact #103):

    total_false_trust_events <= N * 1e-6

At default CI N=1000 the budget is 0.001 expected events; the test
therefore enforces the strict ``count == 0`` zero-tolerance default. The
``release-gate`` N=10000 run keeps the same zero-tolerance default (the
budget allows 0.01 events; one event is already a regression).

Per-flight invariants additionally guarded by this evaluator:

* poison ratio in ``[POISON_RATIO_MIN, POISON_RATIO_MAX]`` (AC-2);
* at least one poisoned tile per **defense layer** in every flight
  (AC-2 — each layer must be exercised so a per-layer regression cannot
  hide behind layer-confusion);
* per-event ``rejection_reason`` is one of the documented categories
  (so an unlabeled false-positive cannot pass as a "rejection" in CSV
  evidence).

Public-boundary discipline: does NOT import any
``src/gps_denied_onboard`` symbol. The evaluator only consumes
runner-collected counter objects.
"""
from __future__ import annotations
import csv
from dataclasses import dataclass, field
from pathlib import Path
from typing import Sequence
# Per-flight share of the aggregate false-trust budget: the budget for a
# run of N flights is N times this value.
FALSE_TRUST_BUDGET_PER_FLIGHT = 1e-6

# Bounds on the fraction of a flight's tile cache that is poisoned (AC-2).
POISON_RATIO_MIN = 0.01
POISON_RATIO_MAX = 0.05

# Identifiers of the three defense layers a poisoned tile can target.
DEFENSE_LAYER_SIGNING = "signing_cert_mismatch"
DEFENSE_LAYER_FRESHNESS = "freshness_violation"
DEFENSE_LAYER_VOTING = "voting_disagreement"

# Every defense layer, in a fixed order.
DEFENSE_LAYERS: tuple[str, ...] = (
    DEFENSE_LAYER_SIGNING,
    DEFENSE_LAYER_FRESHNESS,
    DEFENSE_LAYER_VOTING,
)

# Documented rejection-reason vocabulary: the three defense-layer names plus
# the two downgrade labels.
REJECTION_REASONS: frozenset[str] = frozenset(DEFENSE_LAYERS) | {
    "freshness_gate_downgrade",
    "service_voting_downgrade",
}
@dataclass(frozen=True)
class PoisonedTileSpec:
    """Immutable description of a single crafted tile placed in a flight's cache."""

    # Identifier of the poisoned tile.
    tile_id: str
    # Defense layer the tile targets; MUST be one of ``DEFENSE_LAYERS``
    # (documented expectation only; nothing here enforces it).
    defense_layer: str
@dataclass(frozen=True)
class FalseTrustEvent:
    """Immutable record of one frame in which the SUT emitted
    ``satellite_anchored`` and that estimate traced back to a poisoned tile.
    """

    # Flight in which the event was observed.
    flight_id: str
    # Poisoned tile the estimate traced back to.
    tile_id: str
    # Timestamp of the frame (presumably a monotonic clock in milliseconds,
    # going by the field name; confirm against the runner).
    monotonic_ms: int
    # Defense layer recorded for the event.
    defense_layer: str
@dataclass(frozen=True)
class FlightOutcome:
    """Result of a single synthetic flight.

    * ``total_tile_count``: the **cache size for that flight**, i.e. the
      denominator of the poison ratio.
    * ``poisoned_tiles``: the slate of crafted tiles injected.
    * ``false_trust_events``: the runner-observed frames where the SUT
      trusted a poisoned tile.
    * ``rejection_reasons``: a counter of how often each documented
      rejection-reason fired, taken from the runner's outbound
      ``source_label`` capture + FDR signing-rejection events. The counts
      are used in evidence only (no AC assertion); a flight with zero
      rejection events is suspicious but not necessarily a failure (the
      SUT may have downgraded the candidate without naming the cause).
    """

    flight_id: str
    total_tile_count: int
    poisoned_tiles: Sequence[PoisonedTileSpec]
    false_trust_events: Sequence[FalseTrustEvent]
    rejection_reasons: dict[str, int] = field(default_factory=dict)

    @property
    def poison_ratio(self) -> float:
        """Poisoned tiles divided by cache size; ``0.0`` when the size is not positive."""
        if self.total_tile_count > 0:
            return len(self.poisoned_tiles) / self.total_tile_count
        return 0.0

    @property
    def defense_layers_present(self) -> set[str]:
        """Distinct ``defense_layer`` values found among the poisoned tiles."""
        return set(spec.defense_layer for spec in self.poisoned_tiles)

    @property
    def false_trust_count(self) -> int:
        """How many false-trust events were recorded for this flight."""
        return len(self.false_trust_events)

    def has_unknown_rejection_reasons(self) -> bool:
        """Return True if any ``rejection_reasons`` key is outside ``REJECTION_REASONS``."""
        return not all(
            reason in REJECTION_REASONS for reason in self.rejection_reasons
        )
@dataclass(frozen=True)
class CachePoisoningReport:
    """Aggregate verdict over N flights.

    Holds the evaluated flights, the summed false-trust count, and one
    collection of offending flights per per-flight check. Each ``passes_*``
    property reports one gate; ``passes`` requires all of them.
    """

    flights: Sequence[FlightOutcome]
    flight_count: int
    total_false_trust: int
    flights_with_bad_poison_ratio: Sequence[str]
    flights_missing_defense_layers: Sequence[tuple[str, list[str]]]
    flights_with_unknown_rejection_reasons: Sequence[str]

    @property
    def budget(self) -> float:
        """Allowed number of false-trust events for this many flights."""
        return FALSE_TRUST_BUDGET_PER_FLIGHT * self.flight_count

    @property
    def passes_budget(self) -> bool:
        """True only when no false-trust event was observed at all."""
        # Zero-tolerance default: with a budget of 0.001 expected events at
        # N=1000, a single observed event is already a regression, so the
        # gate compares against zero rather than against ``budget``.
        return self.total_false_trust == 0

    @property
    def passes_ratio(self) -> bool:
        """True when no flight was listed with a bad poison ratio."""
        return not self.flights_with_bad_poison_ratio

    @property
    def passes_layer_coverage(self) -> bool:
        """True when no flight was listed as missing a defense layer."""
        return not self.flights_missing_defense_layers

    @property
    def passes_rejection_reason_vocabulary(self) -> bool:
        """True when no flight was listed with an unknown rejection reason."""
        return not self.flights_with_unknown_rejection_reasons

    @property
    def passes(self) -> bool:
        """Overall verdict: every individual gate must pass."""
        gates = (
            self.passes_budget,
            self.passes_ratio,
            self.passes_layer_coverage,
            self.passes_rejection_reason_vocabulary,
        )
        return all(gates)
def evaluate(flights: Sequence[FlightOutcome]) -> CachePoisoningReport:
    """Compute the aggregate AC-1..AC-3 verdict for one Monte Carlo run.

    Args:
        flights: Per-flight outcomes of the run.

    Returns:
        A ``CachePoisoningReport`` holding the flights, their count, the
        summed false-trust count, and the flights that failed each check:
        poison ratio outside ``[POISON_RATIO_MIN, POISON_RATIO_MAX]``,
        one or more entries of ``DEFENSE_LAYERS`` not represented (paired
        with the sorted missing layer names), and rejection reasons outside
        the documented vocabulary.
    """
    required_layers = frozenset(DEFENSE_LAYERS)

    false_trust_sum = sum(outcome.false_trust_count for outcome in flights)

    out_of_range_ids = tuple(
        outcome.flight_id
        for outcome in flights
        if not (POISON_RATIO_MIN <= outcome.poison_ratio <= POISON_RATIO_MAX)
    )

    # Pair every flight with the layers it failed to exercise, then keep
    # only the flights where that list is non-empty.
    layer_gaps = [
        (outcome.flight_id, sorted(required_layers - outcome.defense_layers_present))
        for outcome in flights
    ]
    uncovered = tuple(gap for gap in layer_gaps if gap[1])

    off_vocabulary_ids = tuple(
        outcome.flight_id
        for outcome in flights
        if outcome.has_unknown_rejection_reasons()
    )

    return CachePoisoningReport(
        flights=tuple(flights),
        flight_count=len(flights),
        total_false_trust=false_trust_sum,
        flights_with_bad_poison_ratio=out_of_range_ids,
        flights_missing_defense_layers=uncovered,
        flights_with_unknown_rejection_reasons=off_vocabulary_ids,
    )
def write_csv_evidence(out_path: Path, report: CachePoisoningReport) -> Path:
    """Write the per-flight CSV evidence file and return its path.

    Layout: a header row, one row per flight in ``report.flights``, an
    empty separator row, then one ``AGGREGATE`` footer row of ``key=value``
    cells. The footer carries every individual gate of the report,
    including ``passes_rejection_reason_vocabulary``, so a run that fails
    only on the rejection-reason vocabulary cannot look all-green in the
    evidence.

    Args:
        out_path: Destination file. Missing parent directories are created
            and an existing file is overwritten.
        report: Aggregate report whose flights and gate flags are written.

    Returns:
        ``out_path``, unchanged.
    """

    def _flag(value: bool) -> str:
        # Lower-case literals used for every gate cell in the footer.
        return "true" if value else "false"

    out_path.parent.mkdir(parents=True, exist_ok=True)
    # UTF-8 is pinned so the evidence bytes do not depend on the host
    # locale; newline="" leaves line endings to the csv module.
    with out_path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(
            [
                "flight_id",
                "total_tile_count",
                "poisoned_tile_count",
                "poison_ratio",
                "defense_layers_present",
                "false_trust_count",
                "rejection_reason_breakdown",
            ]
        )
        for flight in report.flights:
            # Sorted so the cell content is stable across runs.
            layers_present = ",".join(sorted(flight.defense_layers_present)) or "<none>"
            breakdown = ";".join(
                f"{reason}={count}"
                for reason, count in sorted(flight.rejection_reasons.items())
            )
            writer.writerow(
                [
                    flight.flight_id,
                    flight.total_tile_count,
                    len(flight.poisoned_tiles),
                    f"{flight.poison_ratio:.4f}",
                    layers_present,
                    flight.false_trust_count,
                    breakdown,
                ]
            )
        # Blank separator row between the per-flight rows and the footer.
        writer.writerow([])
        writer.writerow(
            [
                "AGGREGATE",
                f"flight_count={report.flight_count}",
                f"total_false_trust={report.total_false_trust}",
                f"budget={report.budget:g}",
                f"passes_budget={_flag(report.passes_budget)}",
                f"passes_ratio={_flag(report.passes_ratio)}",
                f"passes_layer_coverage={_flag(report.passes_layer_coverage)}",
                # Appended last so the existing cell positions are unchanged.
                "passes_rejection_reason_vocabulary="
                f"{_flag(report.passes_rejection_reason_vocabulary)}",
            ]
        )
    return out_path