mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 23:31:13 +00:00
c56d4584e6
Batch 87: 6 NFT-SEC blackbox scenarios + 5 helper evaluators + 75 unit tests + cumulative review batches 85-87. * AZ-436 NFT-SEC-01: cache-poisoning safety budget (AC-NEW-9); aggregate false_trust_count ≤ N×1e-6; zero-tolerance default. Canonical-only by default; E2E_NFT_SEC_01_RELEASE_GATE=1 unlocks full matrix. * AZ-437 NFT-SEC-02 + NFT-SEC-05: shared egress-observation evaluator (AC-NEW-10); SEC-02 = 0 packets to non-e2e-net over 5min replay; SEC-05 = DNS-blackhole sidecar healthy + lookup fails + UDP-53 silent. * AZ-438 NFT-SEC-03: AP-only signing rejection (AC-NEW-11); 3 sub-cases (unsigned/wrong-key/replayed) each reject ≤500ms + no position drift. * AZ-439 NFT-SEC-04: probe (always-run) = no-crash + deterministic decode outcome; ASan-fuzz (release-gate) = 0 findings ≥4h; AC-3 corpus floor informational only per spec. Verdict per-batch: PASS_WITH_WARNINGS (5 Low). Cumulative review for batches 85-87 (K=3 window) also PASS_WITH_WARNINGS with 5 cross-batch findings — recommends hygiene PBIs for write_csv_evidence duplication (13 helpers) and _resolve_fixture_path duplication (13 scenarios), plus new tickets for AZ-595 fixture builder + DNS-blackhole sidecar service. Also adds _docs/LESSONS.md documenting the Jira transition-ID lesson (always call getTransitionsForJiraIssue first, never memorize numeric IDs across sessions). Co-authored-by: Cursor <cursoragent@cursor.com>
239 lines
8.1 KiB
Python
239 lines
8.1 KiB
Python
"""Cache-poisoning safety-budget evaluator for NFT-SEC-01 (AZ-436 / AC-NEW-9).

The contract: across ``N`` synthetic flights — each carrying ``1-5 %`` of
*poisoned* tiles (signing-cert mismatch, freshness violation, or voting
disagreement) — the SUT MUST NOT silently emit a ``satellite_anchored``
estimate that traces back to a poisoned tile (a *false-trust event*).

Aggregate budget (Mode B Fact #103):

    total_false_trust_events <= N * 1e-6

At default CI N=1000 the budget is 0.001 expected events; the test
therefore enforces the strict ``count == 0`` zero-tolerance default. The
``release-gate`` N=10000 run keeps the same zero-tolerance default (the
budget allows 0.01 events; one event is already a regression).

Per-flight invariants additionally guarded by this evaluator:

* poison ratio in ``[POISON_RATIO_MIN, POISON_RATIO_MAX]`` (AC-2);
* at least one poisoned tile per **defense layer** in every flight
  (AC-2 — each layer must be exercised so a per-layer regression cannot
  hide behind layer-confusion);
* per-event ``rejection_reason`` is one of the documented categories
  (so an unlabeled false-positive cannot pass as a "rejection" in CSV
  evidence).

Public-boundary discipline: does NOT import any
``src/gps_denied_onboard`` symbol. The evaluator only consumes
runner-collected counter objects.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Sequence
|
|
|
|
# Expected false-trust events tolerated per synthetic flight; the aggregate
# budget reported for a run of N flights is N times this value.
FALSE_TRUST_BUDGET_PER_FLIGHT = 1e-6

# Inclusive bounds on the share of a flight's tile cache that is poisoned
# (1 % .. 5 %); flights outside this window are flagged by ``evaluate``.
POISON_RATIO_MIN = 0.01
POISON_RATIO_MAX = 0.05
|
|
|
|
# Names of the three defense layers a poisoned tile can target.
DEFENSE_LAYER_SIGNING = "signing_cert_mismatch"
DEFENSE_LAYER_FRESHNESS = "freshness_violation"
DEFENSE_LAYER_VOTING = "voting_disagreement"

# Every flight must carry at least one poisoned tile for each of these.
DEFENSE_LAYERS: tuple[str, ...] = (
    DEFENSE_LAYER_SIGNING,
    DEFENSE_LAYER_FRESHNESS,
    DEFENSE_LAYER_VOTING,
)

# Closed vocabulary of rejection reasons accepted in runner counters: the
# defense-layer names plus the two downgrade labels. Derived from
# ``DEFENSE_LAYERS`` so a newly added layer is accepted automatically.
REJECTION_REASONS: frozenset[str] = frozenset(
    {
        *DEFENSE_LAYERS,
        "freshness_gate_downgrade",
        "service_voting_downgrade",
    }
)
|
|
|
|
|
|
@dataclass(frozen=True)
class PoisonedTileSpec:
    """One poisoned tile slotted into a flight's tile cache."""

    # Identifier of the poisoned tile within the flight's cache.
    tile_id: str
    # Defense layer this tile exercises; MUST be one of ``DEFENSE_LAYERS``
    # (not validated here).
    defense_layer: str
|
|
|
|
|
|
@dataclass(frozen=True)
class FalseTrustEvent:
    """A frame where the SUT emitted ``satellite_anchored`` traced to a poisoned tile."""

    # Flight in which the event was observed.
    flight_id: str
    # Poisoned tile the trusted estimate traces back to.
    tile_id: str
    # NOTE(review): presumably a monotonic-clock timestamp in milliseconds
    # for the offending frame — confirm against the runner.
    monotonic_ms: int
    # Defense layer the trusted tile should have been stopped by.
    defense_layer: str
|
|
|
|
|
|
@dataclass(frozen=True)
class FlightOutcome:
    """One synthetic-flight result.

    ``total_tile_count`` is the **cache size for that flight** (used to
    compute the poison ratio). ``poisoned_tiles`` is the slate of
    crafted tiles injected. ``false_trust_events`` are the runner-observed
    frames where the SUT trusted a poisoned tile.

    ``rejection_reasons`` is a counter of how often each documented
    rejection-reason fired, taken from the runner's outbound
    ``source_label`` capture + FDR signing-rejection events. The *counts*
    are used in evidence only; a flight with zero rejection events is
    suspicious but not necessarily a failure (the SUT may have downgraded
    the candidate without naming the cause). The *keys*, however, are
    checked against ``REJECTION_REASONS`` by
    ``has_unknown_rejection_reasons``.
    """

    flight_id: str
    total_tile_count: int
    poisoned_tiles: Sequence[PoisonedTileSpec]
    false_trust_events: Sequence[FalseTrustEvent]
    rejection_reasons: dict[str, int] = field(default_factory=dict)

    @property
    def poison_ratio(self) -> float:
        """Return the poisoned share of the cache, or ``0.0`` for an empty cache."""
        # Guard against division by zero (and nonsensical negative sizes).
        if self.total_tile_count <= 0:
            return 0.0
        return len(self.poisoned_tiles) / self.total_tile_count

    @property
    def defense_layers_present(self) -> set[str]:
        """Return the distinct defense layers covered by the poisoned tiles."""
        return {tile.defense_layer for tile in self.poisoned_tiles}

    @property
    def false_trust_count(self) -> int:
        """Return the number of false-trust events recorded for this flight."""
        return len(self.false_trust_events)

    def has_unknown_rejection_reasons(self) -> bool:
        """Return ``True`` if any counter key is outside ``REJECTION_REASONS``."""
        return any(
            reason not in REJECTION_REASONS for reason in self.rejection_reasons
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class CachePoisoningReport:
    """Aggregate verdict over N flights."""

    # Per-flight outcomes the report was computed from.
    flights: Sequence[FlightOutcome]
    # Number of flights evaluated (N).
    flight_count: int
    # Sum of false-trust events across all flights.
    total_false_trust: int
    # Flight ids whose poison ratio fell outside the allowed window.
    flights_with_bad_poison_ratio: Sequence[str]
    # ``(flight_id, missing layer names)`` for flights lacking layer coverage.
    flights_missing_defense_layers: Sequence[tuple[str, list[str]]]
    # Flight ids that reported a rejection reason outside the vocabulary.
    flights_with_unknown_rejection_reasons: Sequence[str]

    @property
    def budget(self) -> float:
        """Return the expected-event budget ``N * FALSE_TRUST_BUDGET_PER_FLIGHT``.

        Informational only: ``passes_budget`` does not compare against it.
        """
        return self.flight_count * FALSE_TRUST_BUDGET_PER_FLIGHT

    @property
    def passes_budget(self) -> bool:
        """Return ``True`` only when no false-trust event was observed."""
        # Zero-tolerance default: the budget at N=1000 is 0.001 expected
        # events; one observed event is already a regression even on a
        # purely floating-point reading of the budget.
        return self.total_false_trust == 0

    @property
    def passes_ratio(self) -> bool:
        """Return ``True`` when every flight's poison ratio was in range."""
        return not self.flights_with_bad_poison_ratio

    @property
    def passes_layer_coverage(self) -> bool:
        """Return ``True`` when every flight exercised every defense layer."""
        return not self.flights_missing_defense_layers

    @property
    def passes_rejection_reason_vocabulary(self) -> bool:
        """Return ``True`` when no flight used an undocumented rejection reason."""
        return not self.flights_with_unknown_rejection_reasons

    @property
    def passes(self) -> bool:
        """Return the overall verdict: all four individual gates must pass."""
        return (
            self.passes_budget
            and self.passes_ratio
            and self.passes_layer_coverage
            and self.passes_rejection_reason_vocabulary
        )
|
|
|
|
|
|
def evaluate(flights: Sequence[FlightOutcome]) -> CachePoisoningReport:
    """Compute the aggregate AC-1..AC-3 verdict for one Monte Carlo run.

    Args:
        flights: Per-flight outcomes collected by the runner.

    Returns:
        A ``CachePoisoningReport`` carrying the flights, the summed
        false-trust count and the ids of flights that violated the
        poison-ratio window, lacked a defense layer, or reported a
        rejection reason outside ``REJECTION_REASONS``.
    """
    # Materialise once so the loop, ``flight_count`` and ``report.flights``
    # all see the same items even if a one-shot iterable is passed in.
    flights = tuple(flights)
    required_layers = set(DEFENSE_LAYERS)

    bad_ratio: list[str] = []
    missing_layers: list[tuple[str, list[str]]] = []
    unknown_reasons: list[str] = []
    total_false_trust = 0
    for flight in flights:
        total_false_trust += flight.false_trust_count
        if not (POISON_RATIO_MIN <= flight.poison_ratio <= POISON_RATIO_MAX):
            bad_ratio.append(flight.flight_id)
        # Sorted so the evidence lists missing layers in a stable order.
        missing = sorted(required_layers - flight.defense_layers_present)
        if missing:
            missing_layers.append((flight.flight_id, missing))
        if flight.has_unknown_rejection_reasons():
            unknown_reasons.append(flight.flight_id)
    return CachePoisoningReport(
        flights=flights,
        flight_count=len(flights),
        total_false_trust=total_false_trust,
        flights_with_bad_poison_ratio=tuple(bad_ratio),
        flights_missing_defense_layers=tuple(missing_layers),
        flights_with_unknown_rejection_reasons=tuple(unknown_reasons),
    )
|
|
|
|
|
|
def write_csv_evidence(out_path: Path, report: CachePoisoningReport) -> Path:
    """Per-flight CSV — one row per flight + an aggregate footer row.

    Args:
        out_path: Destination file; missing parent directories are created
            and an existing file is overwritten.
        report: Aggregate report whose flights and verdicts are written.

    Returns:
        ``out_path``, unchanged.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # ``newline=""`` lets the csv module control line endings; the encoding
    # is pinned so evidence files do not depend on the platform default.
    with out_path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(
            [
                "flight_id",
                "total_tile_count",
                "poisoned_tile_count",
                "poison_ratio",
                "defense_layers_present",
                "false_trust_count",
                "rejection_reason_breakdown",
            ]
        )
        for flight in report.flights:
            layers_present = ",".join(sorted(flight.defense_layers_present)) or "<none>"
            breakdown = ";".join(
                f"{reason}={count}"
                for reason, count in sorted(flight.rejection_reasons.items())
            )
            writer.writerow(
                [
                    flight.flight_id,
                    flight.total_tile_count,
                    len(flight.poisoned_tiles),
                    f"{flight.poison_ratio:.4f}",
                    layers_present,
                    flight.false_trust_count,
                    breakdown,
                ]
            )
        # Blank separator row between the per-flight rows and the footer.
        writer.writerow([])
        vocabulary_ok = report.passes_rejection_reason_vocabulary
        writer.writerow(
            [
                "AGGREGATE",
                f"flight_count={report.flight_count}",
                f"total_false_trust={report.total_false_trust}",
                f"budget={report.budget:g}",
                f"passes_budget={'true' if report.passes_budget else 'false'}",
                f"passes_ratio={'true' if report.passes_ratio else 'false'}",
                f"passes_layer_coverage={'true' if report.passes_layer_coverage else 'false'}",
                # Appended last so the positions of the earlier footer
                # fields stay unchanged for existing consumers.
                f"passes_rejection_reason_vocabulary={'true' if vocabulary_ok else 'false'}",
            ]
        )
    return out_path
|