feat(testing): per-frame JSONL trace in E2EHarness

Opt-in trace_path parameter dumps one JSON record per processed frame
with the fields diagnostics need:

  frame_idx, timestamp_ns, vo_success, alignment_success,
  tracking_state, confidence,
  eskf_initialized, eskf_position_enu (or None), eskf_pos_sigma_m,
  estimate_lat/lon, gt_lat/lon/alt

No serialisation or file I/O cost when trace_path is None. The file is
truncated and rewritten on every run (opened in "w" mode, not rotated) — safe
to point at /tmp/foo.jsonl for ad-hoc debugging.

First real run on EuRoC MH_01 (100 frames) immediately exposes the
concrete divergence: vo_success=0/100 (VO never engages on EuRoC
grayscale imagery with current SP+LG adapter), eskf_initialized=0/100,
alignment_success=77/100 (satellite-fallback path fires). Diagnosis
that was hidden behind a single "ATE=10.9 km" number is now machine-
readable per frame.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yuzviak
2026-04-18 14:24:02 +03:00
committed by Maksym Yuzviak
parent 1bf8b2a684
commit a05381ade2
2 changed files with 141 additions and 10 deletions
+75 -2
View File
@@ -13,7 +13,9 @@ What the harness does NOT do:
from __future__ import annotations from __future__ import annotations
import json
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional from typing import Optional
from unittest.mock import AsyncMock, MagicMock from unittest.mock import AsyncMock, MagicMock
@@ -29,7 +31,11 @@ from gps_denied.core.processor import FlightProcessor
from gps_denied.core.recovery import FailureRecoveryCoordinator from gps_denied.core.recovery import FailureRecoveryCoordinator
from gps_denied.core.vo import SequentialVisualOdometry from gps_denied.core.vo import SequentialVisualOdometry
from gps_denied.schemas.graph import FactorGraphConfig from gps_denied.schemas.graph import FactorGraphConfig
from gps_denied.testing.datasets.base import DatasetAdapter, PlatformClass from gps_denied.testing.datasets.base import (
DatasetAdapter,
DatasetPose,
PlatformClass,
)
EARTH_R = 6_378_137.0 EARTH_R = 6_378_137.0
@@ -59,10 +65,12 @@ class E2EHarness:
adapter: DatasetAdapter, adapter: DatasetAdapter,
flight_id: str = "e2e-flight", flight_id: str = "e2e-flight",
max_frames: Optional[int] = None, max_frames: Optional[int] = None,
trace_path: Optional[Path] = None,
) -> None: ) -> None:
self._adapter = adapter self._adapter = adapter
self._flight_id = flight_id self._flight_id = flight_id
self._max_frames = max_frames self._max_frames = max_frames
self._trace_path: Optional[Path] = Path(trace_path) if trace_path else None
self._estimates: list[tuple[int, Optional[tuple[float, float, float]]]] = [] self._estimates: list[tuple[int, Optional[tuple[float, float, float]]]] = []
async def run(self) -> HarnessResult: async def run(self) -> HarnessResult:
@@ -74,6 +82,18 @@ class E2EHarness:
frames = frames[: self._max_frames] frames = frames[: self._max_frames]
gt_poses = gt_poses[: self._max_frames] gt_poses = gt_poses[: self._max_frames]
# Align GT by index so trace records can pair each frame with the
# corresponding pose without an expensive timestamp search.
gt_by_idx: dict[int, DatasetPose] = {}
for i, pose in enumerate(gt_poses):
gt_by_idx[i] = pose
trace_fh = None
if self._trace_path is not None:
self._trace_path.parent.mkdir(parents=True, exist_ok=True)
trace_fh = self._trace_path.open("w")
try:
for frame in frames: for frame in frames:
image = self._load_or_synth_image(frame.image_path) image = self._load_or_synth_image(frame.image_path)
result = await processor.process_frame( result = await processor.process_frame(
@@ -81,9 +101,17 @@ class E2EHarness:
) )
est = None est = None
if result.gps is not None: if result.gps is not None:
est = (result.gps.lat, result.gps.lon, 0.0) # alt not returned here est = (result.gps.lat, result.gps.lon, 0.0)
self._estimates.append((frame.frame_idx, est)) self._estimates.append((frame.frame_idx, est))
if trace_fh is not None:
gt = gt_by_idx.get(frame.frame_idx)
record = self._trace_record(processor, frame, result, gt)
trace_fh.write(json.dumps(record) + "\n")
finally:
if trace_fh is not None:
trace_fh.close()
gt_enu = self._poses_to_enu(gt_poses) gt_enu = self._poses_to_enu(gt_poses)
est_enu = self._estimates_to_enu(gt_poses[0] if gt_poses else None) est_enu = self._estimates_to_enu(gt_poses[0] if gt_poses else None)
@@ -96,6 +124,51 @@ class E2EHarness:
platform_class=self._adapter.capabilities.platform_class, platform_class=self._adapter.capabilities.platform_class,
) )
def _trace_record(
    self,
    processor: FlightProcessor,
    frame,
    result,
    gt: Optional[DatasetPose],
) -> dict:
    """Build one JSONL record describing the pipeline state after a frame.

    The record pairs the per-frame processing result (VO / alignment
    flags, tracking state, confidence, estimated lat/lon) with the ESKF
    state looked up on ``processor`` for this harness's flight id and with
    the ground-truth pose, if one was supplied.

    ESKF-derived numbers are sanitised so each record stays strict JSON:
    a non-finite position component becomes ``None``, and the scalar sigma
    is ``None`` when the covariance is missing or when the mean of the
    first three diagonal variances is non-finite or negative. Without
    this, a diverged filter would make ``json.dumps`` emit ``NaN`` /
    ``Infinity`` tokens that strict JSON parsers reject.

    Args:
        processor: Processor whose private ``_eskf`` mapping is read.
        frame: Dataset frame; ``frame_idx`` and ``timestamp_ns`` are read.
        result: Value returned by ``processor.process_frame`` for ``frame``.
        gt: Ground-truth pose for this frame, or ``None`` if unavailable.

    Returns:
        A dict of JSON-serialisable values, one key per trace field.
    """

    def _finite_or_none(value) -> Optional[float]:
        # JSON has no literal for NaN or +/-Infinity; map them to null.
        number = float(value)
        return number if np.isfinite(number) else None

    eskf = processor._eskf.get(self._flight_id)  # noqa: SLF001 — test harness
    eskf_initialized = eskf is not None and bool(eskf.initialized)

    eskf_position = None
    eskf_sigma = None
    if eskf_initialized:
        pos = eskf.position
        eskf_position = [_finite_or_none(pos[axis]) for axis in range(3)]
        cov = eskf.covariance
        if cov is not None:
            # Mean of the three position variances, reported as its square
            # root so the summary is a single scalar per frame.
            mean_var = float(np.trace(cov[0:3, 0:3])) / 3.0
            if np.isfinite(mean_var) and mean_var >= 0.0:
                eskf_sigma = float(np.sqrt(mean_var))

    gps = result.gps
    est_lat = gps.lat if gps is not None else None
    est_lon = gps.lon if gps is not None else None

    return {
        "frame_idx": frame.frame_idx,
        "timestamp_ns": frame.timestamp_ns,
        "vo_success": bool(result.vo_success),
        "alignment_success": bool(result.alignment_success),
        "tracking_state": result.tracking_state.value,
        "confidence": float(result.confidence),
        "eskf_initialized": eskf_initialized,
        "eskf_position_enu": eskf_position,
        "eskf_pos_sigma_m": eskf_sigma,
        "estimate_lat": est_lat,
        "estimate_lon": est_lon,
        "gt_lat": gt.lat if gt is not None else None,
        "gt_lon": gt.lon if gt is not None else None,
        "gt_alt": gt.alt if gt is not None else None,
    }
def _build_processor(self) -> FlightProcessor: def _build_processor(self) -> FlightProcessor:
repo = MagicMock() repo = MagicMock()
streamer = MagicMock() streamer = MagicMock()
+58
View File
@@ -64,3 +64,61 @@ async def test_harness_max_frames_larger_than_dataset_is_harmless():
harness = E2EHarness(adapter, max_frames=100) harness = E2EHarness(adapter, max_frames=100)
result = await harness.run() result = await harness.run()
assert result.num_frames_submitted == 4 assert result.num_frames_submitted == 4
@pytest.mark.asyncio
async def test_harness_trace_path_none_does_not_emit(tmp_path):
    """Run the harness without ``trace_path`` and check nothing is written.

    The harness is never told about ``tmp_path``, so the final assertion
    is only a sanity check that the run leaves that directory empty.
    """
    await E2EHarness(SyntheticAdapter(num_frames=3, fps=5.0)).run()

    leftovers = list(tmp_path.iterdir())
    assert leftovers == []
@pytest.mark.asyncio
async def test_harness_trace_path_writes_jsonl_with_expected_fields(tmp_path):
    """A traced run writes one JSON object per frame with the agreed keys.

    Every line is parsed and checked, not just the first, because the
    contract below is stated for every record in the file.
    """
    import json

    trace = tmp_path / "run.jsonl"
    adapter = SyntheticAdapter(num_frames=4, fps=5.0)
    harness = E2EHarness(adapter, trace_path=trace)
    await harness.run()

    assert trace.is_file()
    lines = trace.read_text().splitlines()
    # One JSON record per submitted frame
    assert len(lines) == 4

    # Contract: every record carries these keys so downstream tooling can
    # depend on them across datasets.
    expected_keys = {
        "frame_idx",
        "timestamp_ns",
        "vo_success",
        "alignment_success",
        "tracking_state",
        "confidence",
        "eskf_initialized",
        "eskf_position_enu",  # list[3] or None when not initialized
        "eskf_pos_sigma_m",  # float or None
        "estimate_lat",  # float or None
        "estimate_lon",
        "gt_lat",
        "gt_lon",
        "gt_alt",
    }
    for line in lines:
        record = json.loads(line)
        missing = expected_keys - set(record)
        assert not missing, f"trace record missing keys: {missing}"
@pytest.mark.asyncio
async def test_harness_trace_truncates_with_max_frames(tmp_path):
    """With ``max_frames=3`` on a 10-frame adapter the trace has 3 lines."""
    out_file = tmp_path / "run.jsonl"
    source = SyntheticAdapter(num_frames=10, fps=5.0)
    await E2EHarness(source, max_frames=3, trace_path=out_file).run()

    record_count = len(out_file.read_text().splitlines())
    assert record_count == 3