[AZ-414] [AZ-415] [AZ-418] Test batch 71: sharp turn + multi-segment + smoothing

- AZ-414 (FT-P-07 + FT-N-02): sharp_turn_detector helper covering
  AC-1 (gyro_z run detection + synthetic-overlay fallback),
  AC-2/AC-3 (FT-N-02 during-turn label + monotonic covariance),
  AC-4/AC-5/AC-6 (FT-P-07 recovery lag/drift/heading); twin scenario
  files under positive/ and negative/.
- AZ-415 (FT-P-08): multi_segment_evaluator helper + scenario.
- AZ-418 (FT-P-10): smoothing_evaluator helper covering AC-1 (raw +
  smoothed pose pairing), AC-2 (improvement rate >= 0.80), AC-3
  (mean improvement >= 5 m); scenario file.
- All scenarios skip-gated on upstream frame_source_replay /
  imu_replay / fdr_reader stubs (auto-activate when AZ-441 + AZ-407
  leftovers land).
- +68 unit tests; full e2e unit suite: 393 passed.

See _docs/03_implementation/batch_71_report.md and
_docs/03_implementation/reviews/batch_71_review.md.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-17 07:12:24 +03:00
parent 29ac16cfcb
commit c6e6cba237
17 changed files with 3195 additions and 1 deletions
+246
View File
@@ -0,0 +1,246 @@
"""GTSAM smoothing-loop look-back evaluation for FT-P-10 (AZ-418 / AC-4.5 revised).
The SUT exposes (per AC-NEW-3 FDR schema) two record entries per past
keyframe ``k``:
* ``raw_pose_k``: the single-shot pose at the keyframe's first emission.
* ``smoothed_pose_k``: the iSAM2-converged pose at smoother window exit.
This helper pairs them by keyframe id, computes per-keyframe geodesic
distance vs the Derkachi ``GLOBAL_POSITION_INT`` GT, and reports:
* AC-2: ``count(smoothed_error < raw_error) / total_keyframes ≥ 0.80``.
* AC-3: ``mean(raw_error - smoothed_error) ≥ 5 m`` across all keyframes.
Mode B Fact #107 is reflected in the spec: this is an INTERNAL
improvement metric — NOT FC-side retroactive correction.
Public-boundary discipline: this module does NOT import any
``src/gps_denied_onboard`` symbol.
"""
from __future__ import annotations

import csv
from dataclasses import dataclass
from pathlib import Path
from statistics import mean, median
from typing import Iterable, Mapping, Sequence

from .geo import distance_m
# AC-2 threshold: minimum fraction of paired keyframes whose smoothed error
# must be strictly below the raw error.
IMPROVEMENT_RATE_REQUIRED = 0.80
# AC-3 threshold: minimum mean of (raw_error - smoothed_error), in metres.
MEAN_IMPROVEMENT_M_REQUIRED = 5.0
@dataclass(frozen=True)
class GtPose:
    """A single ground-truth sample from the Derkachi GLOBAL_POSITION_INT track.

    Instances are immutable. ``resolve_gt_at`` picks among them by
    comparing ``monotonic_ms`` values.
    """

    # Sample timestamp, matched against a keyframe's ``monotonic_ms``.
    monotonic_ms: int
    # Ground-truth position handed to ``distance_m`` (degrees, per the names).
    lat_deg: float
    lon_deg: float
@dataclass(frozen=True)
class KeyframePoseRecord:
    """A single pose entry read from the FDR for an already-emitted keyframe.

    Scenarios construct these from the FDR archive; the FDR
    ``payload['pose_kind']`` field decides whether an entry is the raw
    or the smoothed pose of its keyframe.
    """

    # Identifier shared by the raw and the smoothed entry of one keyframe.
    keyframe_id: int
    pose_kind: str  # "raw" | "smoothed"
    # Timestamp of this record; ``evaluate`` uses the raw record's value
    # to look up ground truth.
    monotonic_ms: int
    # Estimated position (degrees, per the names).
    lat_deg: float
    lon_deg: float
@dataclass(frozen=True)
class KeyframePair:
    """Raw and smoothed records of one keyframe, scored against ground truth.

    ``raw_error_m`` and ``smoothed_error_m`` hold the distance between
    the matched ``gt`` sample and the respective pose.
    """

    keyframe_id: int
    raw: KeyframePoseRecord
    smoothed: KeyframePoseRecord
    gt: GtPose
    raw_error_m: float
    smoothed_error_m: float

    @property
    def improvement_m(self) -> float:
        """Error removed by smoothing; negative when smoothing made it worse."""
        delta = self.raw_error_m - self.smoothed_error_m
        return delta

    @property
    def smoothed_wins(self) -> bool:
        """``True`` only when the smoothed error is strictly smaller (a tie loses)."""
        return self.raw_error_m > self.smoothed_error_m
@dataclass(frozen=True)
class SmoothingReport:
    """Aggregated smoothing result that the scenario asserts on.

    ``rate_required`` and ``mean_required`` default to the module-level
    thresholds. A report without any pair never passes.
    """

    pairs: tuple[KeyframePair, ...]
    improvement_rate: float
    mean_improvement_m: float
    median_improvement_m: float
    rate_required: float = IMPROVEMENT_RATE_REQUIRED
    mean_required: float = MEAN_IMPROVEMENT_M_REQUIRED

    @property
    def pair_count(self) -> int:
        """Number of keyframes that contributed to the aggregate."""
        return len(self.pairs)

    @property
    def passes_rate(self) -> bool:
        """Whether the improvement rate reaches ``rate_required`` (needs >= 1 pair)."""
        if self.pair_count == 0:
            return False
        return self.improvement_rate >= self.rate_required

    @property
    def passes_mean(self) -> bool:
        """Whether the mean improvement reaches ``mean_required`` (needs >= 1 pair)."""
        if self.pair_count == 0:
            return False
        return self.mean_improvement_m >= self.mean_required

    @property
    def passes(self) -> bool:
        """Whether both the rate check and the mean check succeed."""
        if not self.passes_rate:
            return False
        return self.passes_mean
def pair_records(
    records: Sequence[KeyframePoseRecord],
) -> dict[int, tuple[KeyframePoseRecord | None, KeyframePoseRecord | None]]:
    """Collect the raw and smoothed record of every keyframe.

    Returns a mapping ``keyframe_id -> (raw, smoothed)`` in order of first
    appearance; a slot is ``None`` when that kind was never seen for the
    keyframe.

    Raises:
        ValueError: if a record's ``pose_kind`` is neither ``"raw"`` nor
            ``"smoothed"``, or if a keyframe carries the same kind twice
            (the FDR schema must emit exactly one of each per keyframe).
    """
    known_kinds = ("raw", "smoothed")
    slots_by_keyframe: dict[int, dict[str, KeyframePoseRecord]] = {}

    for rec in records:
        kind = rec.pose_kind
        kf_id = rec.keyframe_id
        if kind not in known_kinds:
            raise ValueError(f"unknown pose_kind '{kind}' for keyframe {kf_id}")

        slots = slots_by_keyframe.get(kf_id)
        if slots is None:
            slots = {}
            slots_by_keyframe[kf_id] = slots

        if kind in slots:
            raise ValueError(f"duplicate {kind} pose for keyframe {kf_id}")
        slots[kind] = rec

    grouped: dict[int, tuple[KeyframePoseRecord | None, KeyframePoseRecord | None]] = {}
    for kf_id, slots in slots_by_keyframe.items():
        grouped[kf_id] = (slots.get("raw"), slots.get("smoothed"))
    return grouped
def resolve_gt_at(monotonic_ms: int, gt_track: Sequence[GtPose]) -> GtPose:
    """Return the ground-truth sample whose timestamp is closest to ``monotonic_ms``.

    The whole track is scanned; when two samples are equally close, the
    one that comes first in ``gt_track`` is returned. With the 10 Hz
    (100 ms cadence) Derkachi track the chosen sample is at most ~50 ms
    away, which is acceptable for the AC-4.5 improvement measurement.

    Raises:
        ValueError: if ``gt_track`` is empty.
    """

    def _time_gap(sample: GtPose) -> int:
        # Absolute timestamp difference between a sample and the query time.
        return abs(sample.monotonic_ms - monotonic_ms)

    if gt_track:
        return min(gt_track, key=_time_gap)
    raise ValueError("gt_track is empty")
def evaluate(
    records: Sequence[KeyframePoseRecord],
    gt_track: Sequence[GtPose],
) -> SmoothingReport:
    """Pair raw and smoothed poses per keyframe and aggregate their errors.

    For every keyframe that has both a raw and a smoothed record, the
    ground-truth sample nearest to the raw record's timestamp is looked
    up and both poses are scored against it with ``distance_m``. Pairs
    appear in the report in ascending ``keyframe_id`` order.

    Keyframes that lack either record are left out of the aggregate
    without raising, and the report does not count them. The spec rule
    "if only one is present, the test fails" therefore has to be enforced
    by the caller, e.g. by inspecting ``pair_records`` output.

    Args:
        records: FDR pose records, raw and smoothed mixed in any order.
        gt_track: Ground-truth samples; must not be empty.

    Returns:
        A ``SmoothingReport``; all three statistics are ``0.0`` when no
        keyframe had both records.

    Raises:
        ValueError: if ``gt_track`` is empty, or if ``pair_records``
            rejects ``records`` (unknown or duplicate ``pose_kind``).
    """
    if not gt_track:
        raise ValueError("gt_track must not be empty")

    pairs: list[KeyframePair] = []
    for kf, (raw, smoothed) in sorted(pair_records(records).items()):
        if raw is None or smoothed is None:
            continue
        # Ground truth is resolved at the raw record's timestamp (the
        # keyframe's actual flight time), not the smoothing-window-exit time.
        gt = resolve_gt_at(raw.monotonic_ms, gt_track)
        pairs.append(
            KeyframePair(
                keyframe_id=kf,
                raw=raw,
                smoothed=smoothed,
                gt=gt,
                raw_error_m=distance_m(
                    gt.lat_deg, gt.lon_deg, raw.lat_deg, raw.lon_deg
                ),
                smoothed_error_m=distance_m(
                    gt.lat_deg, gt.lon_deg, smoothed.lat_deg, smoothed.lon_deg
                ),
            )
        )

    if not pairs:
        # Nothing to aggregate: report neutral statistics (never passes).
        return SmoothingReport(
            pairs=(),
            improvement_rate=0.0,
            mean_improvement_m=0.0,
            median_improvement_m=0.0,
        )

    improvements = [p.improvement_m for p in pairs]
    wins = sum(1 for p in pairs if p.smoothed_wins)
    return SmoothingReport(
        pairs=tuple(pairs),
        improvement_rate=wins / len(pairs),
        mean_improvement_m=mean(improvements),
        # statistics.median averages the two middle values for even counts.
        median_improvement_m=median(improvements),
    )
def write_csv_evidence(out_path: Path, report: SmoothingReport) -> Path:
    """Write the FT-P-10 per-keyframe evidence CSV and return its path.

    Header: ``keyframe_id, raw_lat, raw_lon, smoothed_lat, smoothed_lon,
    gt_lat, gt_lon, raw_error_m, smoothed_error_m, improvement_m,
    smoothed_wins``. One row follows per entry of ``report.pairs``.
    Coordinates are written with 6 decimals, error values with 3, and
    ``smoothed_wins`` as ``true`` / ``false``.

    Missing parent directories of ``out_path`` are created; an existing
    file is overwritten.

    Args:
        out_path: Destination file.
        report: Report whose ``pairs`` are written.

    Returns:
        ``out_path``, unchanged.
    """
    header = (
        "keyframe_id",
        "raw_lat",
        "raw_lon",
        "smoothed_lat",
        "smoothed_lon",
        "gt_lat",
        "gt_lon",
        "raw_error_m",
        "smoothed_error_m",
        "improvement_m",
        "smoothed_wins",
    )
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line endings to the csv module; the explicit
    # encoding keeps the file independent of the host locale.
    with out_path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows(
            (
                p.keyframe_id,
                f"{p.raw.lat_deg:.6f}",
                f"{p.raw.lon_deg:.6f}",
                f"{p.smoothed.lat_deg:.6f}",
                f"{p.smoothed.lon_deg:.6f}",
                f"{p.gt.lat_deg:.6f}",
                f"{p.gt.lon_deg:.6f}",
                f"{p.raw_error_m:.3f}",
                f"{p.smoothed_error_m:.3f}",
                f"{p.improvement_m:.3f}",
                "true" if p.smoothed_wins else "false",
            )
            for p in report.pairs
        )
    return out_path