[AZ-414] [AZ-415] [AZ-418] Test batch 71: sharp turn + multi-segment + smoothing

- AZ-414 (FT-P-07 + FT-N-02): sharp_turn_detector helper covering AC-1 (gyro_z run detection + synthetic-overlay fallback), AC-2/AC-3 (FT-N-02 during-turn label + monotonic covariance), AC-4/AC-5/AC-6 (FT-P-07 recovery lag/drift/heading); twin scenario files under positive/ and negative/. - AZ-415 (FT-P-08): multi_segment_evaluator helper + scenario. - AZ-418 (FT-P-10): smoothing_evaluator helper covering AC-1 (raw + smoothed pose pairing), AC-2 (improvement rate >= 0.80), AC-3 (mean improvement >= 5 m); scenario file. - All scenarios skip-gated on upstream frame_source_replay / imu_replay / fdr_reader stubs (auto-activate when AZ-441 + AZ-407 leftovers land). - +68 unit tests; full e2e unit suite: 393 passed. See _docs/03_implementation/batch_71_report.md and _docs/03_implementation/reviews/batch_71_review.md. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 04:41:13 +00:00 · 2026-05-17 07:12:24 +03:00
parent 29ac16cfcb
commit c6e6cba237
17 changed files with 3195 additions and 1 deletions
@@ -0,0 +1,287 @@
+"""Multi-segment relocalisation evaluation for FT-P-08 (AZ-415 / AC-3.3).
+
+The ``multi-segment-derkachi`` fixture (AZ-408) writes a ``schedule.json``
+naming ≥3 disjoint blackout windows. During replay the SUT MUST:
+
+* AC-2: emit ``source_label = dead_reckoned`` for every estimate inside
+  every blackout window.
+* AC-3: emit the next ``source_label = satellite_anchored`` within
+  ≤3 frames of each blackout's ``end_ms`` (target frame cadence = 3 fps
+  per the runtime profile in `_docs/02_document/tests/blackbox-tests.md`).
+* AC-4: trajectory continuity — the geodesic distance between the last
+  pre-recovery estimate (at or before ``end_ms``) and the first
+  post-recovery anchor must be ≤100 m.
+
+The aggregate passes only when ALL ≥3 windows satisfy ALL three checks.
+
+Public-boundary discipline: this module does NOT import any
+``src/gps_denied_onboard`` symbol.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Iterable, Mapping, Sequence
+
+from .geo import distance_m
+
+DEAD_RECKONED = "dead_reckoned"
+SATELLITE_ANCHORED = "satellite_anchored"
+VISUAL_PROPAGATED = "visual_propagated"
+
+ALLOWED_SOURCE_LABELS = {SATELLITE_ANCHORED, VISUAL_PROPAGATED, DEAD_RECKONED}
+
+# AC-3 / AC-4 / AC-5 thresholds from the FT-P-08 spec.
+MAX_RECOVERY_FRAMES = 3
+MAX_RECOVERY_FRAMES_SAFETY_MS = 1100  # 3 frames @ ~3 fps; +100 ms scheduling slack
+MAX_TRAJECTORY_JUMP_M = 100.0
+MIN_SEGMENTS_REQUIRED = 3
+
+
+@dataclass(frozen=True)
+class BlackoutWindow:
+    """One blackout window from the injector's ``schedule.json``."""
+
+    start_ms: int
+    end_ms: int
+    first_frame_idx: int
+    last_frame_idx: int
+
+    @property
+    def duration_ms(self) -> int:
+        return self.end_ms - self.start_ms
+
+
+@dataclass(frozen=True)
+class EstimateSample:
+    """One outbound estimate observed during replay.
+
+    The scenario builds this list from the SITL listener (for the
+    primary path) or from the post-run FDR archive (for the offline
+    audit). Either source is a public boundary.
+    """
+
+    monotonic_ms: int
+    lat_deg: float
+    lon_deg: float
+    source_label: str
+
+
+@dataclass(frozen=True)
+class PerWindowReport:
+    """Per-blackout-window evaluation produced by ``evaluate_window``."""
+
+    window_index: int
+    start_ms: int
+    end_ms: int
+    samples_inside: int
+    dead_reckoned_inside: int
+    label_violations: tuple[str, ...]
+    recovery_anchor_ms: int | None
+    recovery_lag_ms: int | None
+    trajectory_jump_m: float | None
+
+    @property
+    def passes_label(self) -> bool:
+        """AC-2: every inside-window sample is dead_reckoned."""
+        return (
+            self.samples_inside > 0
+            and self.dead_reckoned_inside == self.samples_inside
+            and not self.label_violations
+        )
+
+    @property
+    def passes_recovery(self) -> bool:
+        """AC-3: a satellite_anchored emission within the recovery budget."""
+        return (
+            self.recovery_lag_ms is not None
+            and self.recovery_lag_ms <= MAX_RECOVERY_FRAMES_SAFETY_MS
+        )
+
+    @property
+    def passes_jump(self) -> bool:
+        """AC-4: trajectory jump ≤100 m."""
+        return (
+            self.trajectory_jump_m is not None
+            and self.trajectory_jump_m <= MAX_TRAJECTORY_JUMP_M
+        )
+
+    @property
+    def passes(self) -> bool:
+        return self.passes_label and self.passes_recovery and self.passes_jump
+
+
+@dataclass(frozen=True)
+class MultiSegmentReport:
+    """Aggregate report across all blackout windows; drives the scenario assertion."""
+
+    per_window: tuple[PerWindowReport, ...]
+    failed_windows: tuple[int, ...] = field(default_factory=tuple)
+
+    @property
+    def window_count(self) -> int:
+        return len(self.per_window)
+
+    @property
+    def passes_segment_count(self) -> bool:
+        return self.window_count >= MIN_SEGMENTS_REQUIRED
+
+    @property
+    def passes(self) -> bool:
+        return (
+            self.passes_segment_count
+            and all(w.passes for w in self.per_window)
+            and not self.failed_windows
+        )
+
+
+def load_schedule(schedule_json: Path) -> list[BlackoutWindow]:
+    """Read the multi_segment injector's ``schedule.json``.
+
+    Shape (per AZ-408 multi_segment._write_schedule):
+        {"segments": [{"start_ms": int, "end_ms": int,
+                       "first_frame_idx": int, "last_frame_idx": int}, ...]}
+    """
+    if not schedule_json.exists():
+        raise FileNotFoundError(
+            f"multi-segment schedule.json not found at {schedule_json} — "
+            "build the multi-segment-derkachi fixture first"
+        )
+    payload = json.loads(schedule_json.read_text())
+    if "segments" not in payload:
+        raise ValueError(
+            f"schedule.json missing 'segments' key — found {list(payload)}"
+        )
+    windows: list[BlackoutWindow] = []
+    for seg in payload["segments"]:
+        windows.append(
+            BlackoutWindow(
+                start_ms=int(seg["start_ms"]),
+                end_ms=int(seg["end_ms"]),
+                first_frame_idx=int(seg["first_frame_idx"]),
+                last_frame_idx=int(seg["last_frame_idx"]),
+            )
+        )
+    return windows
+
+
+def evaluate_window(
+    window: BlackoutWindow,
+    window_index: int,
+    samples: Sequence[EstimateSample],
+) -> PerWindowReport:
+    """Evaluate AC-2 / AC-3 / AC-4 for one blackout window.
+
+    Sample-window classification (inclusive of ``start_ms``, exclusive of
+    ``end_ms``) — the recovery search starts at ``end_ms`` and looks
+    forward.
+    """
+    inside = [s for s in samples if window.start_ms <= s.monotonic_ms < window.end_ms]
+    dead_reckoned_inside = sum(1 for s in inside if s.source_label == DEAD_RECKONED)
+    label_violations = tuple(
+        sorted({s.source_label for s in inside if s.source_label != DEAD_RECKONED})
+    )
+
+    # AC-3 recovery search: first satellite_anchored emission at or after end_ms.
+    recovery: EstimateSample | None = None
+    for s in samples:
+        if s.monotonic_ms >= window.end_ms and s.source_label == SATELLITE_ANCHORED:
+            recovery = s
+            break
+
+    # AC-4 trajectory jump: last estimate at or before end_ms vs the recovery anchor.
+    pre_recovery: EstimateSample | None = None
+    for s in samples:
+        if s.monotonic_ms < window.end_ms:
+            pre_recovery = s
+        else:
+            break
+
+    if recovery is not None and pre_recovery is not None:
+        jump_m: float | None = distance_m(
+            pre_recovery.lat_deg,
+            pre_recovery.lon_deg,
+            recovery.lat_deg,
+            recovery.lon_deg,
+        )
+    else:
+        jump_m = None
+
+    return PerWindowReport(
+        window_index=window_index,
+        start_ms=window.start_ms,
+        end_ms=window.end_ms,
+        samples_inside=len(inside),
+        dead_reckoned_inside=dead_reckoned_inside,
+        label_violations=label_violations,
+        recovery_anchor_ms=recovery.monotonic_ms if recovery is not None else None,
+        recovery_lag_ms=(recovery.monotonic_ms - window.end_ms) if recovery is not None else None,
+        trajectory_jump_m=jump_m,
+    )
+
+
+def evaluate(
+    windows: Sequence[BlackoutWindow],
+    samples: Sequence[EstimateSample],
+) -> MultiSegmentReport:
+    """Evaluate every window; aggregate per AC-1 + AC-2 + AC-3 + AC-4."""
+    for s in samples:
+        if s.source_label not in ALLOWED_SOURCE_LABELS:
+            raise ValueError(
+                f"unknown source_label '{s.source_label}' at {s.monotonic_ms} ms — "
+                f"allowed: {sorted(ALLOWED_SOURCE_LABELS)}"
+            )
+    per_window = tuple(
+        evaluate_window(w, i, samples) for i, w in enumerate(windows)
+    )
+    failed = tuple(w.window_index for w in per_window if not w.passes)
+    return MultiSegmentReport(per_window=per_window, failed_windows=failed)
+
+
+def write_csv_evidence(out_path: Path, report: MultiSegmentReport) -> Path:
+    """Write FT-P-08 per-window evidence CSV.
+
+    Header: ``window_index, start_ms, end_ms, samples_inside,
+    dead_reckoned_inside, label_violations, recovery_lag_ms,
+    trajectory_jump_m, passes_label, passes_recovery, passes_jump, passes``.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(
+            [
+                "window_index",
+                "start_ms",
+                "end_ms",
+                "samples_inside",
+                "dead_reckoned_inside",
+                "label_violations",
+                "recovery_lag_ms",
+                "trajectory_jump_m",
+                "passes_label",
+                "passes_recovery",
+                "passes_jump",
+                "passes",
+            ]
+        )
+        for w in report.per_window:
+            writer.writerow(
+                [
+                    w.window_index,
+                    w.start_ms,
+                    w.end_ms,
+                    w.samples_inside,
+                    w.dead_reckoned_inside,
+                    "|".join(w.label_violations) if w.label_violations else "",
+                    "" if w.recovery_lag_ms is None else w.recovery_lag_ms,
+                    "" if w.trajectory_jump_m is None else f"{w.trajectory_jump_m:.3f}",
+                    "true" if w.passes_label else "false",
+                    "true" if w.passes_recovery else "false",
+                    "true" if w.passes_jump else "false",
+                    "true" if w.passes else "false",
+                ]
+            )
+    return out_path
@@ -0,0 +1,483 @@
+"""Sharp-turn detection + FT-P-07 / FT-N-02 evaluation (AZ-414 / AC-3.2).
+
+The Derkachi telemetry CSV records ``SCALED_IMU2.{xgyro, ygyro, zgyro}``
+in milli-degree/s. A "sharp turn" segment is identified by sustained
+high yaw rate (``|zgyro|``) — ≥``MIN_RUN_LENGTH`` consecutive IMU rows
+above ``DEFAULT_GYRO_Z_THRESHOLD_MDPS``. The threshold is read from
+``AC32_SHARP_TURN_GYRO_Z_MDPS`` env var if set (AZ-414 spec note —
+"reads from the test-spec environment, not from a hardcoded constant").
+
+FT-N-02 (during turn):
+* AC-2: source_label ∈ {visual_propagated, dead_reckoned}.
+* AC-3: cov_semi_major_m non-decreasing across consecutive frames inside
+  the turn.
+
+FT-P-07 (recovery):
+* AC-4: next satellite_anchored emission within ≤3 frames after turn end.
+* AC-5: ``|propagated_centre_at_turn_end − recovery_anchor_centre| ≤ 200 m``.
+* AC-6: heading delta from pre-turn anchor to post-turn anchor handled
+  in the [0°, 70°] envelope (recovery still occurs within the 3-frame budget).
+
+Synthetic-overlay fallback: if no natural Derkachi segment meets the
+threshold, the FT-P-07 / FT-N-02 scenarios fall back to a synthetic
+gyro overlay. The fallback decision is recorded in the FDR / CSV
+``evidence_paths`` per the spec (this helper exposes a flag; the
+scenario writes the marker).
+
+Public-boundary discipline: this module does NOT import any
+``src/gps_denied_onboard`` symbol.
+"""
+
+from __future__ import annotations
+
+import csv
+import math
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Sequence
+
+from .geo import distance_m
+
+DEFAULT_GYRO_Z_THRESHOLD_MDPS = 30_000  # 30 °/s sustained yaw
+MIN_RUN_LENGTH = 3  # consecutive IMU rows above threshold
+SHARP_TURN_ENV_VAR = "AC32_SHARP_TURN_GYRO_Z_MDPS"
+
+MAX_RECOVERY_FRAMES = 3
+MAX_RECOVERY_FRAMES_SAFETY_MS = 1100  # 3 frames @ ~3 fps, +100 ms slack
+MAX_RECOVERY_DRIFT_M = 200.0
+MAX_HEADING_CHANGE_DEG = 70.0
+
+ALLOWED_DURING_TURN_LABELS = {"visual_propagated", "dead_reckoned"}
+
+
+@dataclass(frozen=True)
+class GyroSample:
+    """One IMU row's yaw-rate sample (z axis) — millidegree/s."""
+
+    monotonic_ms: int
+    time_s: float
+    zgyro_mdps: int
+
+
+@dataclass(frozen=True)
+class TurnSegment:
+    """A contiguous run of ≥MIN_RUN_LENGTH samples above the gyro threshold."""
+
+    start_index: int
+    end_index: int  # inclusive — the last sample still above threshold
+    start_ms: int
+    end_ms: int
+    peak_abs_zgyro_mdps: int
+    sample_count: int
+
+    @property
+    def duration_ms(self) -> int:
+        return self.end_ms - self.start_ms
+
+
+@dataclass(frozen=True)
+class TurnDetection:
+    """Per-flight detection: zero or more segments + synthetic-overlay flag."""
+
+    segments: tuple[TurnSegment, ...]
+    threshold_mdps: int
+    synthetic_overlay: bool = False
+
+    @property
+    def has_natural_turn(self) -> bool:
+        return len(self.segments) > 0 and not self.synthetic_overlay
+
+
+@dataclass(frozen=True)
+class TurnFrameSample:
+    """One outbound estimate observed during replay.
+
+    Same shape as ``multi_segment_evaluator.EstimateSample`` but with
+    the additional ``cov_semi_major_m`` channel FT-N-02 AC-3 needs.
+    """
+
+    monotonic_ms: int
+    lat_deg: float
+    lon_deg: float
+    source_label: str
+    cov_semi_major_m: float
+
+
+@dataclass(frozen=True)
+class FtN02WindowReport:
+    """FT-N-02 per-window report (during turn)."""
+
+    segment_index: int
+    samples_inside: int
+    label_violations: tuple[str, ...]
+    cov_non_decreasing: bool
+    first_decreasing_at_ms: int | None
+
+    @property
+    def passes_label(self) -> bool:
+        return self.samples_inside > 0 and not self.label_violations
+
+    @property
+    def passes_cov(self) -> bool:
+        return self.cov_non_decreasing
+
+    @property
+    def passes(self) -> bool:
+        return self.passes_label and self.passes_cov
+
+
+@dataclass(frozen=True)
+class FtP07WindowReport:
+    """FT-P-07 per-window report (recovery)."""
+
+    segment_index: int
+    recovery_anchor_ms: int | None
+    recovery_lag_ms: int | None
+    drift_m: float | None
+    heading_change_deg: float | None
+    in_heading_envelope: bool
+
+    @property
+    def passes_recovery_lag(self) -> bool:
+        return (
+            self.recovery_lag_ms is not None
+            and self.recovery_lag_ms <= MAX_RECOVERY_FRAMES_SAFETY_MS
+        )
+
+    @property
+    def passes_drift(self) -> bool:
+        return self.drift_m is not None and self.drift_m <= MAX_RECOVERY_DRIFT_M
+
+    @property
+    def passes_heading(self) -> bool:
+        return self.in_heading_envelope
+
+    @property
+    def passes(self) -> bool:
+        return self.passes_recovery_lag and self.passes_drift and self.passes_heading
+
+
+def get_threshold_mdps() -> int:
+    """Read the AC-3.2 threshold from env, defaulting to the project value."""
+    raw = os.environ.get(SHARP_TURN_ENV_VAR)
+    if raw is None or raw.strip() == "":
+        return DEFAULT_GYRO_Z_THRESHOLD_MDPS
+    try:
+        value = int(raw)
+    except ValueError:
+        raise ValueError(
+            f"{SHARP_TURN_ENV_VAR}={raw!r} is not a valid int (millidegree/s)"
+        )
+    if value <= 0:
+        raise ValueError(f"{SHARP_TURN_ENV_VAR}={raw!r} must be > 0")
+    return value
+
+
+def load_zgyro_samples(csv_path: Path) -> list[GyroSample]:
+    """Read ``data_imu.csv`` and return per-row yaw-rate samples."""
+    if not csv_path.exists():
+        raise FileNotFoundError(
+            f"data_imu.csv not found at {csv_path} — bind-mount the Derkachi fixture"
+        )
+    samples: list[GyroSample] = []
+    with csv_path.open() as fh:
+        reader = csv.DictReader(fh)
+        if "SCALED_IMU2.zgyro" not in (reader.fieldnames or []):
+            raise ValueError(
+                "data_imu.csv missing required column SCALED_IMU2.zgyro"
+            )
+        for row in reader:
+            ts = row.get("timestamp(ms)", "").strip()
+            if not ts:
+                continue
+            samples.append(
+                GyroSample(
+                    monotonic_ms=int(float(ts)),
+                    time_s=float(row["Time"]),
+                    zgyro_mdps=int(float(row["SCALED_IMU2.zgyro"])),
+                )
+            )
+    return samples
+
+
+def detect_turn_segments(
+    samples: Sequence[GyroSample],
+    *,
+    threshold_mdps: int | None = None,
+    min_run_length: int = MIN_RUN_LENGTH,
+) -> TurnDetection:
+    """Find contiguous runs of |zgyro| ≥ threshold lasting ≥min_run_length samples."""
+    if min_run_length < 1:
+        raise ValueError(f"min_run_length must be ≥1, got {min_run_length}")
+    threshold = threshold_mdps if threshold_mdps is not None else get_threshold_mdps()
+
+    segments: list[TurnSegment] = []
+    run_start: int | None = None
+    run_peak = 0
+    for i, s in enumerate(samples):
+        abs_z = abs(s.zgyro_mdps)
+        if abs_z >= threshold:
+            if run_start is None:
+                run_start = i
+                run_peak = abs_z
+            else:
+                run_peak = max(run_peak, abs_z)
+        else:
+            if run_start is not None:
+                run_len = i - run_start
+                if run_len >= min_run_length:
+                    seg_end = i - 1
+                    segments.append(
+                        TurnSegment(
+                            start_index=run_start,
+                            end_index=seg_end,
+                            start_ms=samples[run_start].monotonic_ms,
+                            end_ms=samples[seg_end].monotonic_ms,
+                            peak_abs_zgyro_mdps=run_peak,
+                            sample_count=run_len,
+                        )
+                    )
+                run_start = None
+                run_peak = 0
+    # Tail run.
+    if run_start is not None:
+        run_len = len(samples) - run_start
+        if run_len >= min_run_length:
+            seg_end = len(samples) - 1
+            segments.append(
+                TurnSegment(
+                    start_index=run_start,
+                    end_index=seg_end,
+                    start_ms=samples[run_start].monotonic_ms,
+                    end_ms=samples[seg_end].monotonic_ms,
+                    peak_abs_zgyro_mdps=run_peak,
+                    sample_count=run_len,
+                )
+            )
+
+    return TurnDetection(
+        segments=tuple(segments),
+        threshold_mdps=threshold,
+        synthetic_overlay=False,
+    )
+
+
+def synthesize_overlay_segment(
+    samples: Sequence[GyroSample],
+    *,
+    threshold_mdps: int,
+    anchor_fraction: float = 0.5,
+    duration_samples: int = MIN_RUN_LENGTH * 2,
+) -> TurnDetection:
+    """Construct a synthetic-overlay turn anchored at ``anchor_fraction`` of the flight.
+
+    The scenario only needs the segment's time bounds to evaluate the
+    SUT's behaviour — the overlay does NOT modify the IMU stream
+    (that's an injector concern). The detection result is marked
+    ``synthetic_overlay=True`` so the scenario writes the flag into
+    the evidence CSV.
+    """
+    if not samples:
+        raise ValueError("samples must not be empty")
+    if not 0.0 <= anchor_fraction <= 1.0:
+        raise ValueError(f"anchor_fraction must be in [0, 1], got {anchor_fraction}")
+    if duration_samples < MIN_RUN_LENGTH:
+        raise ValueError(
+            f"duration_samples must be ≥{MIN_RUN_LENGTH}, got {duration_samples}"
+        )
+    anchor_idx = int(anchor_fraction * (len(samples) - 1))
+    end_idx = min(len(samples) - 1, anchor_idx + duration_samples - 1)
+    seg = TurnSegment(
+        start_index=anchor_idx,
+        end_index=end_idx,
+        start_ms=samples[anchor_idx].monotonic_ms,
+        end_ms=samples[end_idx].monotonic_ms,
+        peak_abs_zgyro_mdps=threshold_mdps,  # synthesised; not natural
+        sample_count=end_idx - anchor_idx + 1,
+    )
+    return TurnDetection(
+        segments=(seg,),
+        threshold_mdps=threshold_mdps,
+        synthetic_overlay=True,
+    )
+
+
+def detect_or_synthesize(csv_path: Path) -> TurnDetection:
+    """Convenience: detect a natural turn; if none, synthesise an overlay.
+
+    The scenario uses this to keep the FT-P-07 / FT-N-02 paths covered
+    on the natural fixture even when the data doesn't naturally include
+    a sharp-turn segment.
+    """
+    samples = load_zgyro_samples(csv_path)
+    natural = detect_turn_segments(samples)
+    if natural.has_natural_turn:
+        return natural
+    return synthesize_overlay_segment(samples, threshold_mdps=natural.threshold_mdps)
+
+
+def _heading_change_deg(
+    pre_lat: float, pre_lon: float,
+    mid_lat: float, mid_lon: float,
+    post_lat: float, post_lon: float,
+) -> float:
+    """Heading delta from (pre→mid) to (mid→post), wrapped to [0, 180]."""
+    def bearing(a_lat: float, a_lon: float, b_lat: float, b_lon: float) -> float:
+        from .geo import delta
+        return delta(a_lat, a_lon, b_lat, b_lon).forward_bearing_deg
+
+    pre_bearing = bearing(pre_lat, pre_lon, mid_lat, mid_lon)
+    post_bearing = bearing(mid_lat, mid_lon, post_lat, post_lon)
+    diff = (post_bearing - pre_bearing) % 360.0
+    if diff > 180.0:
+        diff = 360.0 - diff
+    return diff
+
+
+def evaluate_ft_n_02(
+    segment: TurnSegment,
+    segment_index: int,
+    samples: Sequence[TurnFrameSample],
+) -> FtN02WindowReport:
+    """FT-N-02 per-window evaluation (during-turn label + monotonic covariance)."""
+    inside = [s for s in samples if segment.start_ms <= s.monotonic_ms <= segment.end_ms]
+    label_violations = tuple(
+        sorted({s.source_label for s in inside if s.source_label not in ALLOWED_DURING_TURN_LABELS})
+    )
+
+    cov_non_decreasing = True
+    first_decreasing: int | None = None
+    prev_cov = -math.inf
+    for s in inside:
+        if s.cov_semi_major_m < prev_cov:
+            cov_non_decreasing = False
+            first_decreasing = s.monotonic_ms
+            break
+        prev_cov = s.cov_semi_major_m
+
+    return FtN02WindowReport(
+        segment_index=segment_index,
+        samples_inside=len(inside),
+        label_violations=label_violations,
+        cov_non_decreasing=cov_non_decreasing,
+        first_decreasing_at_ms=first_decreasing,
+    )
+
+
+def evaluate_ft_p_07(
+    segment: TurnSegment,
+    segment_index: int,
+    samples: Sequence[TurnFrameSample],
+) -> FtP07WindowReport:
+    """FT-P-07 per-window evaluation (recovery anchor, drift, heading)."""
+    # Pre-turn anchor: last satellite_anchored at or before start_ms.
+    pre_anchor: TurnFrameSample | None = None
+    for s in samples:
+        if s.monotonic_ms <= segment.start_ms and s.source_label == "satellite_anchored":
+            pre_anchor = s
+        elif s.monotonic_ms > segment.start_ms:
+            break
+
+    # Last inside-window sample (propagated centre at turn end).
+    inside = [s for s in samples if segment.start_ms <= s.monotonic_ms <= segment.end_ms]
+    propagated_at_end = inside[-1] if inside else None
+
+    # Recovery: first satellite_anchored after end_ms.
+    recovery: TurnFrameSample | None = None
+    for s in samples:
+        if s.monotonic_ms > segment.end_ms and s.source_label == "satellite_anchored":
+            recovery = s
+            break
+
+    if recovery is None:
+        return FtP07WindowReport(
+            segment_index=segment_index,
+            recovery_anchor_ms=None,
+            recovery_lag_ms=None,
+            drift_m=None,
+            heading_change_deg=None,
+            in_heading_envelope=False,
+        )
+
+    recovery_lag = recovery.monotonic_ms - segment.end_ms
+    drift: float | None = None
+    if propagated_at_end is not None:
+        drift = distance_m(
+            propagated_at_end.lat_deg, propagated_at_end.lon_deg,
+            recovery.lat_deg, recovery.lon_deg,
+        )
+
+    heading_change: float | None = None
+    in_envelope = True
+    if pre_anchor is not None and propagated_at_end is not None:
+        heading_change = _heading_change_deg(
+            pre_anchor.lat_deg, pre_anchor.lon_deg,
+            propagated_at_end.lat_deg, propagated_at_end.lon_deg,
+            recovery.lat_deg, recovery.lon_deg,
+        )
+        in_envelope = heading_change <= MAX_HEADING_CHANGE_DEG
+
+    return FtP07WindowReport(
+        segment_index=segment_index,
+        recovery_anchor_ms=recovery.monotonic_ms,
+        recovery_lag_ms=recovery_lag,
+        drift_m=drift,
+        heading_change_deg=heading_change,
+        in_heading_envelope=in_envelope,
+    )
+
+
+def write_csv_evidence(
+    out_path: Path,
+    detection: TurnDetection,
+    n02_reports: Sequence[FtN02WindowReport],
+    p07_reports: Sequence[FtP07WindowReport],
+) -> Path:
+    """Write FT-P-07 + FT-N-02 combined evidence CSV.
+
+    The ``synthetic_overlay`` column marks the fallback case per AC-1.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(
+            [
+                "segment_index",
+                "start_ms",
+                "end_ms",
+                "peak_abs_zgyro_mdps",
+                "synthetic_overlay",
+                # FT-N-02 columns
+                "samples_inside",
+                "label_violations",
+                "cov_non_decreasing",
+                # FT-P-07 columns
+                "recovery_lag_ms",
+                "drift_m",
+                "heading_change_deg",
+                "in_heading_envelope",
+                "passes_ft_n_02",
+                "passes_ft_p_07",
+            ]
+        )
+        for seg, n02, p07 in zip(detection.segments, n02_reports, p07_reports):
+            writer.writerow(
+                [
+                    n02.segment_index,
+                    seg.start_ms,
+                    seg.end_ms,
+                    seg.peak_abs_zgyro_mdps,
+                    "true" if detection.synthetic_overlay else "false",
+                    n02.samples_inside,
+                    "|".join(n02.label_violations) if n02.label_violations else "",
+                    "true" if n02.cov_non_decreasing else "false",
+                    "" if p07.recovery_lag_ms is None else p07.recovery_lag_ms,
+                    "" if p07.drift_m is None else f"{p07.drift_m:.3f}",
+                    "" if p07.heading_change_deg is None else f"{p07.heading_change_deg:.3f}",
+                    "true" if p07.in_heading_envelope else "false",
+                    "true" if n02.passes else "false",
+                    "true" if p07.passes else "false",
+                ]
+            )
+    return out_path
@@ -0,0 +1,246 @@
+"""GTSAM smoothing-loop look-back evaluation for FT-P-10 (AZ-418 / AC-4.5 revised).
+
+The SUT exposes (per AC-NEW-3 FDR schema) two record entries per past
+keyframe ``k``:
+
+* ``raw_pose_k``: the single-shot pose at the keyframe's first emission.
+* ``smoothed_pose_k``: the iSAM2-converged pose at smoother window exit.
+
+This helper pairs them by keyframe id, computes per-keyframe geodesic
+distance vs the Derkachi ``GLOBAL_POSITION_INT`` GT, and reports:
+
+* AC-2: ``count(smoothed_error < raw_error) / total_keyframes ≥ 0.80``.
+* AC-3: ``mean(raw_error − smoothed_error) ≥ 5 m`` across all keyframes.
+
+Mode B Fact #107 is reflected in the spec: this is an INTERNAL
+improvement metric — NOT FC-side retroactive correction.
+
+Public-boundary discipline: this module does NOT import any
+``src/gps_denied_onboard`` symbol.
+"""
+
+from __future__ import annotations
+
+import csv
+from dataclasses import dataclass
+from pathlib import Path
+from statistics import mean
+from typing import Iterable, Mapping, Sequence
+
+from .geo import distance_m
+
+IMPROVEMENT_RATE_REQUIRED = 0.80
+MEAN_IMPROVEMENT_M_REQUIRED = 5.0
+
+
+@dataclass(frozen=True)
+class GtPose:
+    """One Derkachi GLOBAL_POSITION_INT GT pose."""
+
+    monotonic_ms: int
+    lat_deg: float
+    lon_deg: float
+
+
+@dataclass(frozen=True)
+class KeyframePoseRecord:
+    """One FDR pose record for a past keyframe.
+
+    The scenario builds these from the FDR archive, separating raw vs
+    smoothed by the FDR ``payload['pose_kind']`` field.
+    """
+
+    keyframe_id: int
+    pose_kind: str  # "raw" | "smoothed"
+    monotonic_ms: int
+    lat_deg: float
+    lon_deg: float
+
+
+@dataclass(frozen=True)
+class KeyframePair:
+    """A matched raw + smoothed pair for one keyframe."""
+
+    keyframe_id: int
+    raw: KeyframePoseRecord
+    smoothed: KeyframePoseRecord
+    gt: GtPose
+    raw_error_m: float
+    smoothed_error_m: float
+
+    @property
+    def improvement_m(self) -> float:
+        return self.raw_error_m - self.smoothed_error_m
+
+    @property
+    def smoothed_wins(self) -> bool:
+        return self.smoothed_error_m < self.raw_error_m
+
+
+@dataclass(frozen=True)
+class SmoothingReport:
+    """Aggregate report consumed by the scenario assertion."""
+
+    pairs: tuple[KeyframePair, ...]
+    improvement_rate: float
+    mean_improvement_m: float
+    median_improvement_m: float
+    rate_required: float = IMPROVEMENT_RATE_REQUIRED
+    mean_required: float = MEAN_IMPROVEMENT_M_REQUIRED
+
+    @property
+    def pair_count(self) -> int:
+        return len(self.pairs)
+
+    @property
+    def passes_rate(self) -> bool:
+        return self.pair_count > 0 and self.improvement_rate >= self.rate_required
+
+    @property
+    def passes_mean(self) -> bool:
+        return self.pair_count > 0 and self.mean_improvement_m >= self.mean_required
+
+    @property
+    def passes(self) -> bool:
+        return self.passes_rate and self.passes_mean
+
+
+def pair_records(
+    records: Sequence[KeyframePoseRecord],
+) -> dict[int, tuple[KeyframePoseRecord | None, KeyframePoseRecord | None]]:
+    """Group records by ``keyframe_id``; return (raw, smoothed) pairs.
+
+    Duplicate ``pose_kind`` values for the same keyframe raise
+    ``ValueError`` — the FDR schema MUST emit exactly one raw + one
+    smoothed per past keyframe.
+    """
+    by_kf: dict[int, dict[str, KeyframePoseRecord]] = {}
+    for r in records:
+        if r.pose_kind not in ("raw", "smoothed"):
+            raise ValueError(
+                f"unknown pose_kind '{r.pose_kind}' for keyframe {r.keyframe_id}"
+            )
+        bucket = by_kf.setdefault(r.keyframe_id, {})
+        if r.pose_kind in bucket:
+            raise ValueError(
+                f"duplicate {r.pose_kind} pose for keyframe {r.keyframe_id}"
+            )
+        bucket[r.pose_kind] = r
+    return {
+        kf: (b.get("raw"), b.get("smoothed"))
+        for kf, b in by_kf.items()
+    }
+
+
+def resolve_gt_at(monotonic_ms: int, gt_track: Sequence[GtPose]) -> GtPose:
+    """Find the GT pose nearest to ``monotonic_ms`` (linear scan).
+
+    The Derkachi GT track is at 10 Hz (100 ms cadence); choosing the
+    nearest sample is < 50 ms off and acceptable for the AC-4.5
+    measurement which is about decimetre-to-metre improvement deltas.
+    """
+    if not gt_track:
+        raise ValueError("gt_track is empty")
+    best = min(gt_track, key=lambda g: abs(g.monotonic_ms - monotonic_ms))
+    return best
+
+
+def evaluate(
+    records: Sequence[KeyframePoseRecord],
+    gt_track: Sequence[GtPose],
+) -> SmoothingReport:
+    """Pair raw + smoothed by keyframe, compute errors, aggregate.
+
+    Keyframes missing either raw or smoothed are excluded from the
+    aggregate (with a docstring note); a future tightening could raise
+    on missing entries, but the spec text "if only one is present, the
+    test fails" applies at the scenario level — we surface it here via
+    the empty-pair count.
+    """
+    if not gt_track:
+        raise ValueError("gt_track must not be empty")
+    paired = pair_records(records)
+    pairs: list[KeyframePair] = []
+    for kf, (raw, smoothed) in sorted(paired.items()):
+        if raw is None or smoothed is None:
+            continue
+        # Use the raw record's monotonic_ms to resolve GT — the keyframe's
+        # actual flight time, not the smoothing-window-exit time.
+        gt = resolve_gt_at(raw.monotonic_ms, gt_track)
+        raw_err = distance_m(gt.lat_deg, gt.lon_deg, raw.lat_deg, raw.lon_deg)
+        smoothed_err = distance_m(gt.lat_deg, gt.lon_deg, smoothed.lat_deg, smoothed.lon_deg)
+        pairs.append(
+            KeyframePair(
+                keyframe_id=kf,
+                raw=raw,
+                smoothed=smoothed,
+                gt=gt,
+                raw_error_m=raw_err,
+                smoothed_error_m=smoothed_err,
+            )
+        )
+    if pairs:
+        wins = sum(1 for p in pairs if p.smoothed_wins)
+        rate = wins / len(pairs)
+        improvements = [p.improvement_m for p in pairs]
+        mean_imp = mean(improvements)
+        sorted_imps = sorted(improvements)
+        n = len(sorted_imps)
+        median_imp = (
+            sorted_imps[n // 2] if n % 2 == 1
+            else (sorted_imps[n // 2 - 1] + sorted_imps[n // 2]) / 2.0
+        )
+    else:
+        rate = 0.0
+        mean_imp = 0.0
+        median_imp = 0.0
+    return SmoothingReport(
+        pairs=tuple(pairs),
+        improvement_rate=rate,
+        mean_improvement_m=mean_imp,
+        median_improvement_m=median_imp,
+    )
+
+
+def write_csv_evidence(out_path: Path, report: SmoothingReport) -> Path:
+    """Write FT-P-10 per-keyframe evidence CSV.
+
+    Header: ``keyframe_id, raw_lat, raw_lon, smoothed_lat, smoothed_lon,
+    gt_lat, gt_lon, raw_error_m, smoothed_error_m, improvement_m,
+    smoothed_wins``.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", newline="") as fh:
+        writer = csv.writer(fh)
+        writer.writerow(
+            [
+                "keyframe_id",
+                "raw_lat",
+                "raw_lon",
+                "smoothed_lat",
+                "smoothed_lon",
+                "gt_lat",
+                "gt_lon",
+                "raw_error_m",
+                "smoothed_error_m",
+                "improvement_m",
+                "smoothed_wins",
+            ]
+        )
+        for p in report.pairs:
+            writer.writerow(
+                [
+                    p.keyframe_id,
+                    f"{p.raw.lat_deg:.6f}",
+                    f"{p.raw.lon_deg:.6f}",
+                    f"{p.smoothed.lat_deg:.6f}",
+                    f"{p.smoothed.lon_deg:.6f}",
+                    f"{p.gt.lat_deg:.6f}",
+                    f"{p.gt.lon_deg:.6f}",
+                    f"{p.raw_error_m:.3f}",
+                    f"{p.smoothed_error_m:.3f}",
+                    f"{p.improvement_m:.3f}",
+                    "true" if p.smoothed_wins else "false",
+                ]
+            )
+    return out_path