mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 19:21:12 +00:00
29ac16cfcb
AZ-409 (3pt) — FT-P-01 still-image frame-center accuracy: - accuracy_evaluator.py: GT loader + Vincenty error + AC-2/AC-3 pass-counts - test_ft_p_01_still_image_accuracy.py: scenario gated on frame_source_replay + sitl_observer NotImplementedError; AC-4 timeout discipline AZ-412 (3pt) — FT-P-04 Derkachi f2f registration >=95% on normal segments: - registration_classifier.py: accel-derived attitude + overlap heuristic + success ratio with AC-3 sharp-turn exclusion - test_ft_p_04_derkachi_f2f_registration.py: scenario gated on frame_source_replay + imu_replay + fdr_reader AZ-413 (3pt) — FT-P-05 + FT-P-06 cross-domain MRE budgets: - mre_evaluator.py: per-image budget (strict <2.5px) + 95th-percentile via numpy linear interp + combined report - test_ft_p_05_sat_anchor.py: cross-domain scenario, reuses accuracy_evaluator for geodesic join - test_ft_p_06_mre_budgets.py: pure piggyback on FT-P-04 + FT-P-05 CSV evidence; skips when either upstream CSV missing Tests: 325 unit tests pass (+77 vs batch 69). Reports: batch_70_report.md, batch_70_review.md (PASS). Co-authored-by: Cursor <cursoragent@cursor.com>
321 lines
9.8 KiB
Python
321 lines
9.8 KiB
Python
"""Unit tests for ``runner.helpers.mre_evaluator`` (FT-P-05 + FT-P-06 / AZ-413).
|
|
|
|
Covers AC-2 of FT-P-05 (every cross-domain MRE < 2.5 px), AC-3 of FT-P-05
|
|
(accuracy alongside MRE — delegated to ``accuracy_evaluator``), and AC-4
|
|
of FT-P-06 (95th-percentile MRE budgets per domain).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import math
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from runner.helpers.mre_evaluator import (
|
|
MRE_P95_CROSS_DOMAIN_BUDGET_PX,
|
|
MRE_P95_FRAME_TO_FRAME_BUDGET_PX,
|
|
MRE_PER_IMAGE_BUDGET_PX,
|
|
CombinedP95Report,
|
|
CrossDomainRecord,
|
|
FrameToFrameRecord,
|
|
PerImageBudgetReport,
|
|
P95Report,
|
|
evaluate_combined_p95,
|
|
evaluate_p95,
|
|
evaluate_per_image_budget,
|
|
load_cross_domain_csv,
|
|
load_frame_to_frame_csv,
|
|
summarize_mre_distribution,
|
|
write_cross_domain_csv,
|
|
)
|
|
|
|
|
|
def test_constants_match_spec() -> None:
    """Pin the three MRE budget constants to the values the AC text states."""
    # Assert — per-image, frame-to-frame p95, cross-domain p95 (in that order).
    actual_budgets = (
        MRE_PER_IMAGE_BUDGET_PX,
        MRE_P95_FRAME_TO_FRAME_BUDGET_PX,
        MRE_P95_CROSS_DOMAIN_BUDGET_PX,
    )
    assert actual_budgets == (2.5, 1.0, 2.5)
|
|
|
|
|
|
def test_evaluate_per_image_budget_all_pass() -> None:
    """Sixty records at 1.5 px each produce a fully passing per-image report."""
    # Arrange
    image_count = 60
    records = []
    for i in range(image_count):
        records.append(CrossDomainRecord(f"AD{i:06d}.jpg", mre_px=1.5, error_m=10.0))

    # Act
    report = evaluate_per_image_budget(records)

    # Assert — every image counted, none failing, max equals the shared MRE.
    assert report.total_images == image_count
    assert report.pass_count == image_count
    assert report.fail_image_ids == ()
    assert report.max_mre_px == 1.5
    assert report.passes is True
|
|
|
|
|
|
def test_evaluate_per_image_budget_single_fail_fails_overall() -> None:
    """A lone record at exactly 2.5 px counts as a failure (comparison is strict)."""
    # Arrange — 59 records at 1.0 px followed by one sitting on the boundary.
    passing_records = [
        CrossDomainRecord(f"AD{i:06d}.jpg", mre_px=1.0, error_m=5.0) for i in range(59)
    ]
    boundary_record = CrossDomainRecord("AD000060.jpg", mre_px=2.5, error_m=5.0)
    records = [*passing_records, boundary_record]

    # Act
    report = evaluate_per_image_budget(records)

    # Assert — only the boundary image is flagged, and it sinks the whole report.
    assert report.pass_count == 59
    assert report.fail_image_ids == ("AD000060.jpg",)
    assert report.passes is False
|
|
|
|
|
|
def test_evaluate_per_image_budget_above_boundary_fails() -> None:
    """A record at 3.0 px is listed as failing and is reported as the max MRE."""
    # Arrange — "a" stays under 2.5 px, "b" is strictly above it.
    within_budget = CrossDomainRecord("a", mre_px=1.0, error_m=5.0)
    over_budget = CrossDomainRecord("b", mre_px=3.0, error_m=15.0)

    # Act
    report = evaluate_per_image_budget([within_budget, over_budget])

    # Assert
    assert report.fail_image_ids == ("b",)
    assert report.passes is False
    assert report.max_mre_px == 3.0
|
|
|
|
|
|
def test_evaluate_per_image_budget_empty_list_does_not_pass() -> None:
    """An empty record list must not be reported as passing."""
    # Act — nothing to arrange; the input is simply empty.
    empty_report = evaluate_per_image_budget([])

    # Assert — zero records is no positive evidence of compliance.
    assert empty_report.passes is False
|
|
|
|
|
|
def test_evaluate_per_image_budget_rejects_zero_budget() -> None:
    """A budget of 0.0 px raises ValueError with the "must be > 0" message."""
    # Act / Assert
    expected_message = "budget_px must be > 0"
    with pytest.raises(ValueError, match=expected_message):
        evaluate_per_image_budget([], budget_px=0.0)
|
|
|
|
|
|
def test_evaluate_p95_uses_numpy_linear_interpolation() -> None:
    """evaluate_p95 must agree with ``np.percentile`` at its default settings."""
    # Arrange — 0.1, 0.2, ..., 2.0: twenty evenly spaced samples.
    samples = []
    for i in range(1, 21):
        samples.append(round(0.1 * i, 2))
    reference = np.percentile(np.asarray(samples, dtype=float), 95)
    expected_p95 = float(reference)

    # Act
    report = evaluate_p95(samples, budget_px=2.5)

    # Assert
    assert report.sample_count == 20
    assert report.p95_px == pytest.approx(expected_p95)
    assert report.passes is True
|
|
|
|
|
|
def test_evaluate_p95_passes_when_below_budget() -> None:
    """p95 < 1.0 → passes for the frame-to-frame budget."""
    # Arrange — 100 samples, all below 1.0. Under numpy's default linear
    # interpolation the 95th percentile sits at virtual index 0.95 * 99 = 94.05,
    # between the last 0.5 and the first 0.9:
    #   0.5 + 0.05 * (0.9 - 0.5) = 0.52   (not 0.5).
    samples = [0.5] * 95 + [0.9] * 5

    # Act
    report = evaluate_p95(samples, budget_px=MRE_P95_FRAME_TO_FRAME_BUDGET_PX)

    # Assert — pin the interpolated value as well as the verdict so the
    # arithmetic documented above is actually checked.
    assert report.p95_px == pytest.approx(0.52)
    assert report.passes is True
|
|
|
|
|
|
def test_evaluate_p95_fails_when_above_budget() -> None:
    """A p95 of 1.5 px does not pass the frame-to-frame budget."""
    # Arrange — the top ten of 100 samples sit at 1.5, so p95 lands on 1.5.
    low_samples = [0.5] * 90
    high_samples = [1.5] * 10
    samples = low_samples + high_samples

    # Act
    report = evaluate_p95(samples, budget_px=MRE_P95_FRAME_TO_FRAME_BUDGET_PX)

    # Assert
    assert report.passes is False
    assert report.p95_px == pytest.approx(1.5, abs=1e-6)
|
|
|
|
|
|
def test_evaluate_p95_empty_input_does_not_pass() -> None:
    """With no samples the report has count 0, a NaN p95, and does not pass."""
    # Act — nothing to arrange; the sample list is empty.
    empty_report = evaluate_p95([], budget_px=2.5)

    # Assert
    assert empty_report.sample_count == 0
    assert math.isnan(empty_report.p95_px)
    assert empty_report.passes is False
|
|
|
|
|
|
def test_evaluate_p95_rejects_zero_budget() -> None:
    """A budget of 0.0 px raises ValueError with the "must be > 0" message."""
    # Act / Assert
    expected_message = "budget_px must be > 0"
    with pytest.raises(ValueError, match=expected_message):
        evaluate_p95([1.0], budget_px=0.0)
|
|
|
|
|
|
def test_evaluate_combined_p95_both_pass() -> None:
    """With both domains under budget, each sub-report and the combined one pass."""
    # Arrange — 100 frame-to-frame records at 0.4 px, 60 cross-domain at 1.0 px.
    f2f = []
    for i in range(100):
        f2f.append(FrameToFrameRecord(frame_index=i, mre_px=0.4))
    xd = []
    for i in range(60):
        xd.append(CrossDomainRecord(f"AD{i:06d}.jpg", mre_px=1.0, error_m=5.0))

    # Act
    report = evaluate_combined_p95(f2f, xd)

    # Assert
    assert report.frame_to_frame.passes is True
    assert report.cross_domain.passes is True
    assert report.passes is True
|
|
|
|
|
|
def test_evaluate_combined_p95_fails_when_frame_to_frame_fails() -> None:
    """A failing frame-to-frame p95 fails the combined report on its own."""
    # Arrange — the last 10 of 100 f2f records sit at 1.5 px (p95 ≈ 1.5);
    # every cross-domain record is at 1.0 px (p95 ≈ 1.0).
    quiet_frames = [FrameToFrameRecord(frame_index=i, mre_px=0.5) for i in range(90)]
    noisy_frames = [FrameToFrameRecord(frame_index=i, mre_px=1.5) for i in range(90, 100)]
    f2f = quiet_frames + noisy_frames
    xd = [CrossDomainRecord(f"a{i}", mre_px=1.0, error_m=5.0) for i in range(60)]

    # Act
    report = evaluate_combined_p95(f2f, xd)

    # Assert — cross-domain still passes, so the failure comes from f2f alone.
    assert report.frame_to_frame.passes is False
    assert report.cross_domain.passes is True
    assert report.passes is False
|
|
|
|
|
|
def test_evaluate_combined_p95_fails_when_cross_domain_fails() -> None:
    """cross-domain p95 ≥ 2.5 → combined fails even if f2f passes."""
    # Arrange — f2f is flat at 0.5 px. Six of the 60 cross-domain records sit
    # at 3.0 px; under numpy's default linear interpolation the 95th percentile
    # (virtual index 0.95 * 59 = 56.05) falls inside that 3.0 px tail.
    f2f = [FrameToFrameRecord(frame_index=i, mre_px=0.5) for i in range(100)]
    xd = [CrossDomainRecord(f"a{i}", mre_px=1.0, error_m=5.0) for i in range(54)] + [
        CrossDomainRecord(f"b{i}", mre_px=3.0, error_m=5.0) for i in range(6)
    ]

    # Act
    report = evaluate_combined_p95(f2f, xd)

    # Assert — pin the passing f2f half too, so the combined failure is
    # attributable to the cross-domain side alone (the "even if f2f passes"
    # part of the docstring).
    assert report.frame_to_frame.passes is True
    assert report.cross_domain.passes is False
    assert report.passes is False
|
|
|
|
|
|
def test_write_cross_domain_csv_round_trip(tmp_path: Path) -> None:
    """Writing then loading keeps the record count, first image id and both MREs."""
    # Arrange
    first = CrossDomainRecord("AD000001.jpg", mre_px=1.234, error_m=12.345)
    second = CrossDomainRecord("AD000002.jpg", mre_px=2.6, error_m=200.0)
    out = tmp_path / "ft-p-05.csv"

    # Act
    write_cross_domain_csv(out, [first, second])
    loaded = load_cross_domain_csv(out)

    # Assert — MRE values are compared with a 1e-3 absolute tolerance.
    assert len(loaded) == 2
    head, tail = loaded[0], loaded[1]
    assert head.image_id == "AD000001.jpg"
    assert head.mre_px == pytest.approx(1.234, abs=1e-3)
    assert tail.mre_px == pytest.approx(2.6, abs=1e-3)
|
|
|
|
|
|
def test_write_cross_domain_csv_emits_pass_mre_column(tmp_path: Path) -> None:
    """Each row's pass_mre cell reflects the < 2.5 strict comparison."""
    # Arrange — a clear pass, a record exactly on the boundary, and one just under.
    records = [
        CrossDomainRecord("a", mre_px=1.0, error_m=5.0),
        CrossDomainRecord("b", mre_px=2.5, error_m=5.0),
        CrossDomainRecord("c", mre_px=2.499, error_m=5.0),
    ]
    out = tmp_path / "ft-p-05.csv"

    # Act
    write_cross_domain_csv(out, records)
    # Read through a context manager so the handle is closed deterministically
    # (a bare ``out.open()`` passed to csv.reader is never closed); newline=""
    # is what the csv module asks for when it is handed a file object.
    with out.open(newline="") as handle:
        rows = list(csv.reader(handle))

    # Assert — rows[0] is skipped (presumably the header row); column index 7
    # is read as the pass_mre cell — confirm against the writer's column order
    # if the CSV layout ever changes.
    assert rows[1][7] == "true"  # a (1.0 px)
    assert rows[2][7] == "false"  # b (2.5 px — strict <)
    assert rows[3][7] == "true"  # c (2.499 px)
|
|
|
|
|
|
def test_load_cross_domain_csv_rejects_missing_file(tmp_path: Path) -> None:
    """Loading a path that does not exist raises FileNotFoundError."""
    # Arrange — a path inside the temp dir that was never created.
    absent = tmp_path / "missing.csv"

    # Act / Assert
    with pytest.raises(FileNotFoundError):
        load_cross_domain_csv(absent)
|
|
|
|
|
|
def test_load_cross_domain_csv_rejects_missing_columns(tmp_path: Path) -> None:
    """A CSV with only image_id and mre_px columns is rejected with ValueError."""
    # Arrange — header carries just two columns, followed by one data row.
    truncated_csv = "image_id,mre_px\nx,1.0\n"
    bad = tmp_path / "bad.csv"
    bad.write_text(truncated_csv)

    # Act / Assert
    with pytest.raises(ValueError, match="missing columns"):
        load_cross_domain_csv(bad)
|
|
|
|
|
|
def test_load_frame_to_frame_csv_rejects_missing_mre_column(tmp_path: Path) -> None:
    """An FT-P-04 CSV without an mre_px column must raise, not load silently."""
    # Arrange — a header with nine columns, none of them mre_px, plus one row.
    header = "frame_index,imu_row_index,bank_deg,pitch_deg,translation_m,overlap_fraction,is_normal,excluded_reason,registration_success\n"
    data_row = "0,0,0.0,0.0,0.0,1.0,true,,true\n"
    bad = tmp_path / "ft-p-04.csv"
    bad.write_text(header + data_row)

    # Act / Assert — the error message is expected to mention the missing column.
    with pytest.raises(ValueError, match="mre_px"):
        load_frame_to_frame_csv(bad)
|
|
|
|
|
|
def test_load_frame_to_frame_csv_round_trip(tmp_path: Path) -> None:
    """Rows with an mre_px value are parsed; the row with a blank value is dropped."""
    # Arrange — four data rows, of which frame 2 has an empty mre_px cell.
    csv_text = "frame_index,mre_px\n0,0.5\n1,0.7\n2,\n3,1.1\n"
    good = tmp_path / "ft-p-04.csv"
    good.write_text(csv_text)

    # Act
    records = load_frame_to_frame_csv(good)

    # Assert — frame 2 is absent from the result.
    loaded_indices = [record.frame_index for record in records]
    assert loaded_indices == [0, 1, 3]
    assert records[0].mre_px == 0.5
|
|
|
|
|
|
def test_summarize_mre_distribution_basic_stats() -> None:
    """count / median / max / p95 are checked for the ten samples 0.0 … 9.0."""
    # Arrange — one record per frame, with mre_px equal to the frame index.
    records = []
    for i in range(10):
        records.append(FrameToFrameRecord(frame_index=i, mre_px=float(i)))

    # Act
    summary = summarize_mre_distribution(records)

    # Assert — p95 is compared against numpy's percentile of the same values.
    assert summary["count"] == 10
    assert summary["median"] == pytest.approx(4.5)
    assert summary["max"] == 9.0
    reference_p95 = np.percentile(np.arange(10, dtype=float), 95)
    assert summary["p95"] == pytest.approx(reference_p95)
|
|
|
|
|
|
def test_summarize_mre_distribution_empty_returns_nan() -> None:
    """With no records the summary has count 0 and NaN for median and p95."""
    # Act — nothing to arrange; the record list is empty.
    empty_summary = summarize_mre_distribution([])

    # Assert
    assert empty_summary["count"] == 0
    assert math.isnan(empty_summary["median"])
    assert math.isnan(empty_summary["p95"])
|