"""Unit tests for ``runner.helpers.mre_evaluator`` (FT-P-05 + FT-P-06 / AZ-413).

Covers AC-2 of FT-P-05 (every cross-domain MRE < 2.5 px), AC-3 of FT-P-05
(accuracy alongside MRE — delegated to ``accuracy_evaluator``), and AC-4 of
FT-P-06 (95th-percentile MRE budgets per domain).
"""

from __future__ import annotations

import csv
import math
from pathlib import Path

import numpy as np
import pytest

from runner.helpers.mre_evaluator import (
    MRE_P95_CROSS_DOMAIN_BUDGET_PX,
    MRE_P95_FRAME_TO_FRAME_BUDGET_PX,
    MRE_PER_IMAGE_BUDGET_PX,
    CombinedP95Report,
    CrossDomainRecord,
    FrameToFrameRecord,
    PerImageBudgetReport,
    P95Report,
    evaluate_combined_p95,
    evaluate_p95,
    evaluate_per_image_budget,
    load_cross_domain_csv,
    load_frame_to_frame_csv,
    summarize_mre_distribution,
    write_cross_domain_csv,
)


def test_constants_match_spec() -> None:
    """The three budgets must match the AC text."""
    # Assert
    assert MRE_PER_IMAGE_BUDGET_PX == 2.5
    assert MRE_P95_FRAME_TO_FRAME_BUDGET_PX == 1.0
    assert MRE_P95_CROSS_DOMAIN_BUDGET_PX == 2.5


def test_evaluate_per_image_budget_all_pass() -> None:
    """All MREs under 2.5 → AC-2 passes."""
    # Arrange
    records = [
        CrossDomainRecord(f"AD{i:06d}.jpg", mre_px=1.5, error_m=10.0)
        for i in range(60)
    ]

    # Act
    report = evaluate_per_image_budget(records)

    # Assert
    assert report.total_images == 60
    assert report.pass_count == 60
    assert report.fail_image_ids == ()
    assert report.max_mre_px == 1.5
    assert report.passes is True


def test_evaluate_per_image_budget_single_fail_fails_overall() -> None:
    """One MRE at the boundary → fails (strict < 2.5)."""
    # Arrange — 59 pass, 1 at exactly 2.5
    records = [
        CrossDomainRecord(f"AD{i:06d}.jpg", mre_px=1.0, error_m=5.0)
        for i in range(59)
    ]
    records.append(CrossDomainRecord("AD000060.jpg", mre_px=2.5, error_m=5.0))

    # Act
    report = evaluate_per_image_budget(records)

    # Assert
    assert report.pass_count == 59
    assert report.fail_image_ids == ("AD000060.jpg",)
    assert report.passes is False


def test_evaluate_per_image_budget_above_boundary_fails() -> None:
    """An MRE strictly above 2.5 fails."""
    # Arrange
    records = [
        CrossDomainRecord("a", mre_px=1.0, error_m=5.0),
        CrossDomainRecord("b", mre_px=3.0, error_m=15.0),
    ]

    # Act
    report = evaluate_per_image_budget(records)

    # Assert
    assert report.fail_image_ids == ("b",)
    assert report.passes is False
    assert report.max_mre_px == 3.0


def test_evaluate_per_image_budget_empty_list_does_not_pass() -> None:
    """Zero records → does NOT pass (no positive evidence of compliance)."""
    # Act
    report = evaluate_per_image_budget([])

    # Assert
    assert report.passes is False


def test_evaluate_per_image_budget_rejects_zero_budget() -> None:
    """A ``budget_px`` of 0.0 raises ``ValueError`` naming the constraint."""
    # Act / Assert
    with pytest.raises(ValueError, match="budget_px must be > 0"):
        evaluate_per_image_budget([], budget_px=0.0)


def test_evaluate_p95_uses_numpy_linear_interpolation() -> None:
    """Spec mandates numpy's default percentile algorithm; verify match."""
    # Arrange — 20 samples uniformly from 0.1 to 2.0.
    samples = [round(0.1 * i, 2) for i in range(1, 21)]
    expected_p95 = float(np.percentile(np.asarray(samples, dtype=float), 95))

    # Act
    report = evaluate_p95(samples, budget_px=2.5)

    # Assert
    assert report.sample_count == 20
    assert report.p95_px == pytest.approx(expected_p95)
    assert report.passes is True


def test_evaluate_p95_passes_when_below_budget() -> None:
    """p95 < 1.0 → passes for the frame-to-frame budget."""
    # Arrange — 100 samples mostly below 1.0
    # With numpy's linear interpolation the virtual index is 0.95 * 99 = 94.05,
    # so p95 = 0.5 + 0.05 * (0.9 - 0.5) = 0.52.
    samples = [0.5] * 95 + [0.9] * 5

    # Act
    report = evaluate_p95(samples, budget_px=MRE_P95_FRAME_TO_FRAME_BUDGET_PX)

    # Assert
    assert report.passes is True


def test_evaluate_p95_fails_when_above_budget() -> None:
    """p95 ≥ 1.0 → fails."""
    # Arrange
    samples = [0.5] * 90 + [1.5] * 10  # p95 ≈ 1.5

    # Act
    report = evaluate_p95(samples, budget_px=MRE_P95_FRAME_TO_FRAME_BUDGET_PX)

    # Assert
    assert report.passes is False
    assert report.p95_px == pytest.approx(1.5, abs=1e-6)


def test_evaluate_p95_empty_input_does_not_pass() -> None:
    """Zero samples → NaN p95, does not pass."""
    # Act
    report = evaluate_p95([], budget_px=2.5)

    # Assert
    assert report.sample_count == 0
    assert math.isnan(report.p95_px)
    assert report.passes is False


def test_evaluate_p95_rejects_zero_budget() -> None:
    """A ``budget_px`` of 0.0 raises ``ValueError`` naming the constraint."""
    # Act / Assert
    with pytest.raises(ValueError, match="budget_px must be > 0"):
        evaluate_p95([1.0], budget_px=0.0)


def test_evaluate_combined_p95_both_pass() -> None:
    """Both domains below their budgets → combined report passes."""
    # Arrange
    f2f = [FrameToFrameRecord(frame_index=i, mre_px=0.4) for i in range(100)]
    xd = [
        CrossDomainRecord(f"AD{i:06d}.jpg", mre_px=1.0, error_m=5.0)
        for i in range(60)
    ]

    # Act
    report = evaluate_combined_p95(f2f, xd)

    # Assert
    assert report.frame_to_frame.passes is True
    assert report.cross_domain.passes is True
    assert report.passes is True


def test_evaluate_combined_p95_fails_when_frame_to_frame_fails() -> None:
    """f2f p95 ≥ 1.0 → combined fails even if cross-domain passes."""
    # Arrange — f2f p95 ≈ 1.5, cross-domain p95 ≈ 1.0
    f2f = [FrameToFrameRecord(frame_index=i, mre_px=0.5) for i in range(90)] + [
        FrameToFrameRecord(frame_index=i, mre_px=1.5) for i in range(90, 100)
    ]
    xd = [CrossDomainRecord(f"a{i}", mre_px=1.0, error_m=5.0) for i in range(60)]

    # Act
    report = evaluate_combined_p95(f2f, xd)

    # Assert
    assert report.frame_to_frame.passes is False
    assert report.cross_domain.passes is True
    assert report.passes is False


def test_evaluate_combined_p95_fails_when_cross_domain_fails() -> None:
    """cross-domain p95 ≥ 2.5 → combined fails even if f2f passes."""
    # Arrange
    f2f = [FrameToFrameRecord(frame_index=i, mre_px=0.5) for i in range(100)]
    xd = [CrossDomainRecord(f"a{i}", mre_px=1.0, error_m=5.0) for i in range(54)] + [
        CrossDomainRecord(f"b{i}", mre_px=3.0, error_m=5.0) for i in range(6)
    ]

    # Act
    report = evaluate_combined_p95(f2f, xd)

    # Assert
    assert report.cross_domain.passes is False
    assert report.passes is False


def test_write_cross_domain_csv_round_trip(tmp_path: Path) -> None:
    """write + read returns the same records."""
    # Arrange
    records = [
        CrossDomainRecord("AD000001.jpg", mre_px=1.234, error_m=12.345),
        CrossDomainRecord("AD000002.jpg", mre_px=2.6, error_m=200.0),
    ]
    out = tmp_path / "ft-p-05.csv"

    # Act
    write_cross_domain_csv(out, records)
    loaded = load_cross_domain_csv(out)

    # Assert
    assert len(loaded) == 2
    assert loaded[0].image_id == "AD000001.jpg"
    assert loaded[0].mre_px == pytest.approx(1.234, abs=1e-3)
    assert loaded[1].mre_px == pytest.approx(2.6, abs=1e-3)


def test_write_cross_domain_csv_emits_pass_mre_column(tmp_path: Path) -> None:
    """Each row's pass_mre cell reflects the < 2.5 strict comparison."""
    # Arrange
    records = [
        CrossDomainRecord("a", mre_px=1.0, error_m=5.0),
        CrossDomainRecord("b", mre_px=2.5, error_m=5.0),
        CrossDomainRecord("c", mre_px=2.499, error_m=5.0),
    ]
    out = tmp_path / "ft-p-05.csv"

    # Act
    write_cross_domain_csv(out, records)
    # Read inside a context manager so the handle is closed deterministically;
    # newline="" is the csv module's recommended mode for reader input.
    with out.open(newline="") as handle:
        rows = list(csv.reader(handle))

    # Assert — rows[0] is skipped as the header row.
    # NOTE(review): index 7 is presumably the pass_mre column — confirm against
    # the header written by write_cross_domain_csv.
    assert rows[1][7] == "true"  # a (1.0 px)
    assert rows[2][7] == "false"  # b (2.5 px — strict <)
    assert rows[3][7] == "true"  # c (2.499 px)


def test_load_cross_domain_csv_rejects_missing_file(tmp_path: Path) -> None:
    """Loading a path that does not exist raises ``FileNotFoundError``."""
    # Act / Assert
    with pytest.raises(FileNotFoundError):
        load_cross_domain_csv(tmp_path / "missing.csv")


def test_load_cross_domain_csv_rejects_missing_columns(tmp_path: Path) -> None:
    """A CSV with only image_id and mre_px raises a "missing columns" error."""
    # Arrange
    bad = tmp_path / "bad.csv"
    bad.write_text("image_id,mre_px\nx,1.0\n")

    # Act / Assert
    with pytest.raises(ValueError, match="missing columns"):
        load_cross_domain_csv(bad)


def test_load_frame_to_frame_csv_rejects_missing_mre_column(tmp_path: Path) -> None:
    """If FT-P-04 evidence lacks mre_px, FT-P-06 must fail loudly."""
    # Arrange
    bad = tmp_path / "ft-p-04.csv"
    bad.write_text(
        "frame_index,imu_row_index,bank_deg,pitch_deg,translation_m,overlap_fraction,is_normal,excluded_reason,registration_success\n"
        "0,0,0.0,0.0,0.0,1.0,true,,true\n"
    )

    # Act / Assert
    with pytest.raises(ValueError, match="mre_px"):
        load_frame_to_frame_csv(bad)


def test_load_frame_to_frame_csv_round_trip(tmp_path: Path) -> None:
    """When mre_px is present, records parse correctly."""
    # Arrange
    good = tmp_path / "ft-p-04.csv"
    good.write_text(
        "frame_index,mre_px\n0,0.5\n1,0.7\n2,\n3,1.1\n"
    )

    # Act
    records = load_frame_to_frame_csv(good)

    # Assert — blank mre_px rows are skipped.
    assert [r.frame_index for r in records] == [0, 1, 3]
    assert records[0].mre_px == 0.5


def test_summarize_mre_distribution_basic_stats() -> None:
    """median / p95 / max / count for a tiny sample."""
    # Arrange
    records = [FrameToFrameRecord(frame_index=i, mre_px=float(i)) for i in range(10)]

    # Act
    summary = summarize_mre_distribution(records)

    # Assert
    assert summary["count"] == 10
    assert summary["median"] == pytest.approx(4.5)
    assert summary["max"] == 9.0
    assert summary["p95"] == pytest.approx(np.percentile(np.arange(10, dtype=float), 95))


def test_summarize_mre_distribution_empty_returns_nan() -> None:
    """An empty record list yields count 0 and NaN for median and p95."""
    # Act
    summary = summarize_mre_distribution([])

    # Assert
    assert summary["count"] == 0
    assert math.isnan(summary["median"])
    assert math.isnan(summary["p95"])