[AZ-234] [AZ-235] [AZ-236] [AZ-237] Add replay tests

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 16:31:14 +00:00 · 2026-05-05 06:24:10 +03:00
parent c30fd4f67d
commit 5acd14b792
12 changed files with 616 additions and 3 deletions
@@ -9,6 +9,7 @@ from __future__ import annotations
 import argparse
 import csv
 import json
+import math
 import os
 from dataclasses import dataclass, field
 from enum import Enum
@@ -65,6 +66,24 @@ class RecordedInteraction:
    response: Mapping[str, str | bool]


@dataclass(frozen=True)
class ExpectedCoordinate:
    """Expected geographic position for a single reference image.

    Instances are immutable (frozen dataclass) and compare by field values.
    """

    # Identifier of the image this coordinate belongs to.
    image_ref: str
    # Expected latitude in degrees.
    latitude_deg: float
    # Expected longitude in degrees.
    longitude_deg: float
+
+
@dataclass(frozen=True)
class ReplayEstimate:
    """Position estimate produced for one image during a replay run.

    Instances are immutable (frozen dataclass) and compare by field values.
    """

    # Identifier of the image the estimate was produced for.
    image_ref: str
    # Estimated latitude in degrees.
    latitude_deg: float
    # Estimated longitude in degrees.
    longitude_deg: float
    # Presumably the semi-major axis of the 95% confidence ellipse, in metres
    # (inferred from the field name -- confirm with the producer).
    covariance_95_semi_major_m: float
    # Label naming the source of the estimate.
    source_label: str
    # Age of the anchor behind the estimate, in milliseconds (per field name).
    anchor_age_ms: int
    # Time from capture to output, in milliseconds (per field name).
    capture_to_output_latency_ms: float
+
+
@dataclass(frozen=True)
 class ScenarioReport:
    scenario_id: str
@@ -80,6 +99,7 @@ class ScenarioReport:
    error_message: str
    artifacts: tuple[Path, ...]
    interactions: tuple[RecordedInteraction, ...]
+    metrics: Mapping[str, float | str | bool] = field(default_factory=dict)


@dataclass(frozen=True)
@@ -133,6 +153,9 @@ class SatelliteCacheStub(DeterministicStub):
            {
                "variant": variant,
                "trusted": trusted,
+                "freshness_status": "fresh" if trusted else "rejected",
+                "fixture_size_bytes": "1048576",
+                "storage_budget_bytes": "10737418240",
                "network_fetch_attempted": False,
                "provenance": "offline-fixture",
            },
@@ -374,10 +397,142 @@ def default_scenarios() -> tuple[ScenarioConfig, ...]:
    )


def load_expected_coordinates(coordinates_path: Path) -> tuple[ExpectedCoordinate, ...]:
    """Load expected image coordinates from a CSV fixture.

    The file must start with a header row naming the ``image``, ``lat`` and
    ``lon`` columns. Whitespace around header names and cell values is
    ignored, as is a leading UTF-8 byte-order mark.

    Args:
        coordinates_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        One ``ExpectedCoordinate`` per data row, in file order.

    Raises:
        ValueError: If a row has no image reference, has a missing or
            non-numeric latitude/longitude, lies outside WGS84 bounds, or if
            the file contains no data rows.
    """
    rows: list[ExpectedCoordinate] = []
    # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the mark
    # stays attached to the first header name and the "image" column would
    # never be found.
    with coordinates_path.open(encoding="utf-8-sig", newline="") as coordinates_file:
        reader = csv.DictReader(coordinates_file)
        for row in reader:
            # DictReader stores surplus cells under the key None; skip them.
            normalized_row = {key.strip(): value for key, value in row.items() if key is not None}
            image_ref = (normalized_row.get("image") or "").strip()
            if not image_ref:
                raise ValueError("expected coordinate row is missing image reference")
            try:
                latitude = float((normalized_row.get("lat") or "").strip())
                longitude = float((normalized_row.get("lon") or "").strip())
            except ValueError as error:
                # Re-raise with the image reference so the bad row can be found.
                raise ValueError(
                    f"expected coordinate row has a missing or non-numeric lat/lon: {image_ref}"
                ) from error
            # The negated range checks also reject NaN, which fails every comparison.
            if not -90.0 <= latitude <= 90.0 or not -180.0 <= longitude <= 180.0:
                raise ValueError(f"expected coordinate row is outside WGS84 bounds: {image_ref}")
            rows.append(
                ExpectedCoordinate(
                    image_ref=image_ref,
                    latitude_deg=latitude,
                    longitude_deg=longitude,
                )
            )
    if not rows:
        raise ValueError("expected coordinate fixture is empty")
    return tuple(rows)
+
+
def evaluate_still_image_estimates(
    expected_coordinates: Sequence[ExpectedCoordinate],
    estimates: Sequence[ReplayEstimate],
) -> Mapping[str, float | str | bool]:
    """Score replay estimates against the expected coordinates.

    Each estimate is matched to an expected coordinate by ``image_ref``. The
    distance between the two positions is computed with ``haversine_m`` and
    the per-estimate latencies are summarised with ``percentile``.

    Args:
        expected_coordinates: Reference positions, one per image.
        estimates: Replay outputs; exactly one per expected image.

    Returns:
        A mapping with ``frames_processed``, ``within_50_m_rate``,
        ``within_20_m_rate``, ``p50_latency_ms``, ``p95_latency_ms``,
        ``p99_latency_ms``, ``dropped_frame_rate`` and ``threshold_passed``.
        ``threshold_passed`` is true when at least 80% of estimates are within
        50 m and at least 50% are within 20 m.

    Raises:
        ValueError: If the estimate count differs from the number of expected
            images, if there is nothing to evaluate, if an estimate refers to
            an unknown image or repeats an image, or if an estimate fails the
            confidence-field checks.
    """
    expected_by_image = {coordinate.image_ref: coordinate for coordinate in expected_coordinates}
    if len(estimates) != len(expected_by_image):
        raise ValueError("replay estimate count does not match expected coordinate count")
    if not estimates:
        # Without this guard the rate computations below divide by zero.
        raise ValueError("cannot evaluate an empty set of replay estimates")

    # Check image coverage before computing anything: with equal counts, a
    # repeated image reference means another expected image is never scored.
    seen_image_refs: set[str] = set()
    for estimate in estimates:
        if estimate.image_ref not in expected_by_image:
            raise ValueError(f"unexpected estimate image reference: {estimate.image_ref}")
        if estimate.image_ref in seen_image_refs:
            raise ValueError(f"duplicate estimate image reference: {estimate.image_ref}")
        seen_image_refs.add(estimate.image_ref)

    distances = []
    latencies = []
    for estimate in estimates:
        expected = expected_by_image[estimate.image_ref]
        _require_confidence_fields(estimate)
        distances.append(
            haversine_m(
                expected.latitude_deg,
                expected.longitude_deg,
                estimate.latitude_deg,
                estimate.longitude_deg,
            )
        )
        latencies.append(estimate.capture_to_output_latency_ms)

    within_50_m = sum(distance <= 50.0 for distance in distances) / len(distances)
    within_20_m = sum(distance <= 20.0 for distance in distances) / len(distances)
    return {
        "frames_processed": float(len(estimates)),
        "within_50_m_rate": within_50_m,
        "within_20_m_rate": within_20_m,
        "p50_latency_ms": percentile(latencies, 50),
        "p95_latency_ms": percentile(latencies, 95),
        "p99_latency_ms": percentile(latencies, 99),
        # Fixed at 0.0: the checks above reject any run with a missing estimate.
        "dropped_frame_rate": 0.0,
        "threshold_passed": within_50_m >= 0.80 and within_20_m >= 0.50,
    }
+
+
def validate_derkachi_alignment(
    video_duration_s: float,
    telemetry_duration_s: float,
    telemetry_rows: int,
    frame_rate_hz: float = 30.0,
) -> Mapping[str, float | str | bool]:
    """Check that the Derkachi video and telemetry fixtures line up.

    Args:
        video_duration_s: Duration of the video, in seconds.
        telemetry_duration_s: Duration covered by the telemetry, in seconds.
        telemetry_rows: Number of telemetry rows in the fixture.
        frame_rate_hz: Video frame rate used to derive the frame count.

    Returns:
        A mapping with ``video_duration_s``, ``telemetry_duration_s``,
        ``duration_delta_s``, ``frames_per_telemetry`` and
        ``alignment_valid`` (always ``True`` when the function returns).

    Raises:
        ValueError: If a duration or the frame rate is not finite, the frame
            rate is not positive, the durations differ by more than 250 ms,
            the telemetry is empty, or the ratio of video frames to telemetry
            rows is not approximately 3.
    """
    # NaN compares False against the 250 ms limit and infinity makes round()
    # raise OverflowError, so reject non-finite inputs explicitly.
    if not (math.isfinite(video_duration_s) and math.isfinite(telemetry_duration_s)):
        raise ValueError("Derkachi video and telemetry durations must be finite")
    if not math.isfinite(frame_rate_hz) or frame_rate_hz <= 0.0:
        raise ValueError("Derkachi frame rate must be a positive finite number")

    duration_delta_s = abs(video_duration_s - telemetry_duration_s)
    if duration_delta_s > 0.250:
        raise ValueError("Derkachi video and telemetry durations differ by more than 250 ms")
    if telemetry_rows <= 0:
        raise ValueError("Derkachi telemetry fixture is empty")

    frame_count = round(video_duration_s * frame_rate_hz)
    frames_per_telemetry = frame_count / telemetry_rows
    if not math.isclose(frames_per_telemetry, 3.0, rel_tol=0.02, abs_tol=0.05):
        raise ValueError("Derkachi replay must have approximately 3 video frames per telemetry row")

    return {
        "video_duration_s": video_duration_s,
        "telemetry_duration_s": telemetry_duration_s,
        "duration_delta_s": duration_delta_s,
        "frames_per_telemetry": frames_per_telemetry,
        "alignment_valid": True,
    }
+
+
def percentile(values: Sequence[float], percentile_value: int) -> float:
    """Return the nearest-rank percentile of ``values``.

    The result is always an element of ``values``: the one at rank
    ``ceil(percentile_value / 100 * len(values))`` in ascending order.
    Ranks outside the valid range are clamped, so a percentile of 0 or less
    yields the minimum and 100 or more yields the maximum.

    Args:
        values: Non-empty sequence of samples.
        percentile_value: Percentile to look up, e.g. 50, 95 or 99.

    Returns:
        The sample at the requested nearest-rank position.

    Raises:
        ValueError: If ``values`` is empty.
    """
    if not values:
        raise ValueError("cannot compute percentile for empty values")
    ordered = sorted(values)
    count = len(ordered)
    # Ceiling division without a float intermediate: (p / 100.0) * n can land
    # just above an integer (0.07 * 100 == 7.000000000000001) and push the
    # rank one position too high.
    rank = -((-percentile_value * count) // 100)
    index = min(count - 1, max(0, int(rank) - 1))
    return ordered[index]
+
+
def mavlink_source_is_authorized(source_system_id: int, allowed_source_system_ids: set[int]) -> bool:
    """Report whether ``source_system_id`` is one of the allowed system ids."""
    is_authorized = source_system_id in allowed_source_system_ids
    return is_authorized
+
+
def haversine_m(
    latitude_a_deg: float,
    longitude_a_deg: float,
    latitude_b_deg: float,
    longitude_b_deg: float,
) -> float:
    """Return the great-circle distance in metres between two points.

    Uses the haversine formula on a sphere of radius 6,371,000 m.

    Args:
        latitude_a_deg: Latitude of the first point, in degrees.
        longitude_a_deg: Longitude of the first point, in degrees.
        latitude_b_deg: Latitude of the second point, in degrees.
        longitude_b_deg: Longitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface, in metres.
    """
    earth_radius_m = 6_371_000.0
    latitude_a = math.radians(latitude_a_deg)
    latitude_b = math.radians(latitude_b_deg)
    delta_latitude = math.radians(latitude_b_deg - latitude_a_deg)
    delta_longitude = math.radians(longitude_b_deg - longitude_a_deg)
    haversine = (
        math.sin(delta_latitude / 2.0) ** 2
        + math.cos(latitude_a) * math.cos(latitude_b) * math.sin(delta_longitude / 2.0) ** 2
    )
    # Rounding can push the term a hair outside [0, 1] for (near-)antipodal
    # points, which would make asin() raise a math domain error.
    haversine = min(1.0, max(0.0, haversine))
    return 2.0 * earth_radius_m * math.asin(math.sqrt(haversine))
+
+
 def _optional_float(value: float | None) -> str:
    return "" if value is None else f"{value:.3f}"


def _require_confidence_fields(estimate: ReplayEstimate) -> None:
    """Validate the confidence-related fields of a replay estimate.

    Args:
        estimate: The estimate to check.

    Raises:
        ValueError: If the covariance is negative or NaN, the source label is
            empty or only whitespace, or the anchor age is negative.
    """
    # Written as "not >= 0" rather than "< 0" so that NaN, which compares
    # False against everything, is rejected as well.
    if not estimate.covariance_95_semi_major_m >= 0.0:
        raise ValueError(f"estimate covariance is invalid: {estimate.image_ref}")
    # A label made only of whitespace carries no information either.
    if not (estimate.source_label or "").strip():
        raise ValueError(f"estimate source label is missing: {estimate.image_ref}")
    if estimate.anchor_age_ms < 0:
        raise ValueError(f"estimate anchor age is invalid: {estimate.image_ref}")
+
+
 def main(argv: Sequence[str] | None = None) -> int:
    parser = argparse.ArgumentParser(description="Run deterministic black-box replay scenarios.")
    parser.add_argument(