[AZ-440] [AZ-441] [AZ-442] [AZ-443] NFT-LIM-01/02/03+05/04 blackbox scenarios

Batch 88 — adds four resource-limit blackbox scenarios + pure-logic helpers + unit tests: - NFT-LIM-01 Jetson memory (AC-NEW-13): tier2_only; Plan A/B budgets; AC-4 OOM-event scan; 30 s warm-up window; VmRSS + tegrastats streams. - NFT-LIM-02 FDR size (AC-7.3): 30 min → 8 h linear extrapolation against 50 GiB; ±60 s replay-window slack for AC-1. - NFT-LIM-03+05 storage (AC-7.4 + AC-NEW-12 + RESTRICT-STORAGE): aggregate ≤ 100 GiB across tile-cache + tile-cache-write + fdr-output; thumbnail-log < 1 GiB strict 8 h-extrapolated. - NFT-LIM-04 thermal (AC-NEW-5 PARTIAL): tier2_only; CPU/SoC p99 ≤ T_throttle − 5 °C; throttle-event scan; PARTIAL annotation written to traceability-status.json. Thresholds fixture lives at e2e/fixtures/jetson/thermal-thresholds.json (moved from the task spec's suggested tests/fixtures/ path so the file stays inside the blackbox_tests Owns: e2e/** envelope). All four helpers are public-boundary-only (no src/gps_denied_onboard imports). Scenarios skip cleanly in the Tier-1 docker harness pending AZ-595 (SITL replay builder) for the four shared fixture inputs and AZ-444 (Tier-2 Jetson runner) for the tier2_only scenarios. Code review: PASS_WITH_WARNINGS (0/0/2/1). Both Mediums are carried-over write_csv_evidence + _resolve_fixture_path duplication, deferred to AZ-446 (batch 89). Low is the self-resolved AZ-443 fixture ownership drift documented in the review. Tests: 1223 e2e/_unit_tests passing (+1 vs. batch 87 from the new directory-layout entry); 24 resource_limit scenarios collect and skip cleanly under runner/pytest.ini. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-22 19:31:15 +00:00 · 2026-05-17 18:01:55 +03:00
parent d1e30f818f
commit 6e4a575221
22 changed files with 2785 additions and 4 deletions
@@ -0,0 +1,183 @@
+"""Unit tests for ``runner.helpers.fdr_size_evaluator`` (AZ-441 / NFT-LIM-02)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import fdr_size_evaluator as fse
+
+# 30 minutes expressed in milliseconds (the nominal replay-window length used below).
+THIRTY_MIN_MS = 30 * 60 * 1000
+
+
+def _linear_samples(
+    *, size_at_30min_bytes: int, sample_count: int = 31
+) -> list[fse.FdrSizeSample]:
+    """Build a linear size ramp sampled once per minute.
+
+    Sample ``i`` is stamped ``i`` minutes after t=0 and its size grows
+    linearly from 0 (first sample) to ``size_at_30min_bytes`` (last sample).
+    With the default ``sample_count`` of 31 the last sample lands at exactly
+    30 min; other counts keep the one-minute spacing, so the ramp ends at
+    ``sample_count - 1`` minutes instead.
+
+    A ``sample_count`` of 1 yields a single zero-sized sample at t=0 instead
+    of dividing by zero; a count of 0 (or less) yields an empty list.
+    """
+    # Number of one-minute intervals in the ramp, clamped to 1 so that a
+    # single-sample request (whose only index is 0) cannot divide by zero.
+    intervals = max(sample_count - 1, 1)
+    return [
+        fse.FdrSizeSample(
+            monotonic_ms=minute * 60_000,
+            size_bytes=int(size_at_30min_bytes * minute / intervals),
+        )
+        for minute in range(sample_count)
+    ]
+
+
+# ───────────────────────── evaluate ─────────────────────────
+
+
+def test_evaluate_under_budget_passes() -> None:
+    """A 1 GiB / 30 min ramp extrapolates to 16 GiB and clears every check."""
+    # Arrange — 16 GiB projected for 8 h sits well under the 50 GiB budget
+    one_gib = 1 * fse.GIB_BYTES
+    ramp = _linear_samples(size_at_30min_bytes=one_gib)
+
+    # Act
+    result = fse.evaluate(ramp)
+
+    # Assert
+    assert result.extrapolated_8h_bytes == 16 * fse.GIB_BYTES
+    assert result.passes_replay_window
+    assert result.passes_extrapolation
+    assert result.passes
+
+
+def test_evaluate_at_budget_passes_ac_2() -> None:
+    """An extrapolation landing exactly on the budget satisfies AC-2."""
+    # Arrange — a 30 min size of 50 GiB / 16 (3.125 GiB) projects to 50 GiB
+    budget_sixteenth = int((50 * fse.GIB_BYTES) / 16)
+    ramp = _linear_samples(size_at_30min_bytes=budget_sixteenth)
+
+    # Act
+    result = fse.evaluate(ramp)
+
+    # Assert — AC-2 uses "≤", so hitting the budget exactly still passes
+    assert result.passes_extrapolation
+
+
+def test_evaluate_over_budget_fails_ac_2() -> None:
+    """A 4 GiB / 30 min ramp projects to 64 GiB and breaches AC-2."""
+    # Arrange — 64 GiB projected for 8 h exceeds the 50 GiB budget
+    ramp = _linear_samples(size_at_30min_bytes=4 * fse.GIB_BYTES)
+
+    # Act
+    result = fse.evaluate(ramp)
+
+    # Assert
+    assert result.extrapolated_8h_bytes == 64 * fse.GIB_BYTES
+    assert not result.passes_extrapolation
+    assert not result.passes
+
+
+def test_evaluate_short_window_fails_ac_1() -> None:
+    """A replay covering only 5 min cannot satisfy the 30 min AC-1 window."""
+    # Arrange — six per-minute samples (t = 0..5 min), growing 10 MiB a minute
+    ten_mib = 10 * 1024**2
+    short_run = []
+    for minute in range(6):
+        short_run.append(
+            fse.FdrSizeSample(monotonic_ms=minute * 60_000, size_bytes=minute * ten_mib)
+        )
+
+    # Act
+    result = fse.evaluate(short_run)
+
+    # Assert
+    assert not result.passes_replay_window
+    assert not result.passes
+
+
+def test_evaluate_window_within_slack_passes_ac_1() -> None:
+    """A final sample 30 s past the 30 min mark stays inside the default slack."""
+    # Arrange — overshoot the nominal window by 30 s; default slack is 60 s
+    ramp = _linear_samples(size_at_30min_bytes=1 * fse.GIB_BYTES, sample_count=31)
+    final = ramp[-1]
+    # Re-stamp the last sample at 30 min + 30 s, keeping its size
+    ramp[-1] = fse.FdrSizeSample(
+        monotonic_ms=THIRTY_MIN_MS + 30_000,
+        size_bytes=final.size_bytes,
+    )
+
+    # Act
+    result = fse.evaluate(ramp)
+
+    # Assert
+    assert result.passes_replay_window
+
+
+def test_evaluate_unsorted_samples_still_correct() -> None:
+    """Input order must not influence the computed report fields."""
+    # Arrange — identical data in ascending and in descending time order
+    ascending = _linear_samples(size_at_30min_bytes=1 * fse.GIB_BYTES)
+    descending = ascending[::-1]
+
+    # Act
+    from_ascending = fse.evaluate(ascending)
+    from_descending = fse.evaluate(descending)
+
+    # Assert
+    for field in ("size_at_30min_bytes", "extrapolated_8h_bytes", "replay_window_ms"):
+        assert getattr(from_ascending, field) == getattr(from_descending, field), field
+
+
+def test_evaluate_empty_samples_returns_zero_report() -> None:
+    """With no samples the report has no sizes and fails both checks."""
+    # Act
+    result = fse.evaluate([])
+
+    # Assert — nothing measured, nothing extrapolated
+    assert result.sample_count == 0
+    assert result.size_at_30min_bytes is None
+    assert result.extrapolated_8h_bytes is None
+    # Assert — neither acceptance check can pass without data
+    assert not result.passes_replay_window
+    assert not result.passes_extrapolation
+
+
+def test_evaluate_rejects_non_positive_budget() -> None:
+    """A zero byte budget is rejected with ValueError."""
+    # Arrange
+    zero_budget = 0
+
+    # Assert
+    with pytest.raises(ValueError):
+        fse.evaluate([], budget_bytes=zero_budget)
+
+
+def test_evaluate_rejects_negative_slack() -> None:
+    """A negative replay-window slack is rejected with ValueError."""
+    # Arrange
+    negative_slack_ms = -1
+
+    # Assert
+    with pytest.raises(ValueError):
+        fse.evaluate([], replay_window_slack_ms=negative_slack_ms)
+
+
+def test_evaluate_custom_budget_overrides_default() -> None:
+    """An explicit ``budget_bytes`` is used for the verdict and echoed back."""
+    # Arrange — the ramp projects to 16 GiB at 8 h, double the 8 GiB budget
+    tight_budget = 8 * fse.GIB_BYTES
+    ramp = _linear_samples(size_at_30min_bytes=1 * fse.GIB_BYTES)
+
+    # Act
+    result = fse.evaluate(ramp, budget_bytes=tight_budget)
+
+    # Assert
+    assert result.budget_bytes == tight_budget
+    assert not result.passes_extrapolation
+
+
+# ───────────────────────── CSV evidence ─────────────────────────
+
+
+def test_write_csv_evidence_one_row(tmp_path: Path) -> None:
+    """The evidence CSV starts with ``sample_count`` and ends its row with ``true``."""
+    # Arrange — a passing report built from the 1 GiB ramp
+    result = fse.evaluate(_linear_samples(size_at_30min_bytes=1 * fse.GIB_BYTES))
+    csv_path = tmp_path / "report.csv"
+
+    # Act
+    fse.write_csv_evidence(csv_path, result)
+
+    # Assert
+    with csv_path.open() as handle:
+        header, data_row, *_ = list(csv.reader(handle))
+    assert header[0] == "sample_count"
+    assert data_row[-1] == "true"
+
+
+def test_write_per_minute_csv_orders_by_timestamp(tmp_path: Path) -> None:
+    """Rows are written in ascending timestamp order even for reversed input."""
+    # Arrange — hand the writer the ramp newest-first
+    samples = list(reversed(_linear_samples(size_at_30min_bytes=1 * fse.GIB_BYTES)))
+    out = tmp_path / "per-min.csv"
+
+    # Act
+    fse.write_per_minute_csv(out, samples)
+
+    # Assert — newline="" is how the csv module recommends opening CSV files
+    with out.open(newline="") as fh:
+        rows = list(csv.reader(fh))
+    # Column 1 of every data row (everything after the header) is the timestamp
+    timestamps = [int(r[1]) for r in rows[1:]]
+    # Guard against a vacuous pass: an empty list is trivially "sorted"
+    assert timestamps, "expected at least one data row after the header"
+    assert timestamps == sorted(timestamps)
@@ -0,0 +1,255 @@
+"""Unit tests for ``runner.helpers.memory_budget_evaluator`` (AZ-440 / NFT-LIM-01)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import memory_budget_evaluator as mbe
+
+
+# ───────────────────────── PlanBudgets ─────────────────────────
+
+
+def test_plan_a_budgets_match_ac_2_and_ac_3() -> None:
+    """Plan A resolves to a 4.5 GiB steady budget and a 5.0 GiB peak budget."""
+    # Act
+    plan_a = mbe.PlanBudgets.for_plan(mbe.Plan.PLAN_A)
+
+    # Assert
+    expected_steady = int(4.5 * mbe.GIB_BYTES)
+    expected_peak = int(5.0 * mbe.GIB_BYTES)
+    assert plan_a.steady_bytes == expected_steady
+    assert plan_a.peak_bytes == expected_peak
+
+
+def test_plan_b_budgets_match_ac_5() -> None:
+    """Plan B resolves to a 6.0 GiB steady budget and a 6.5 GiB peak budget."""
+    # Act
+    plan_b = mbe.PlanBudgets.for_plan(mbe.Plan.PLAN_B)
+
+    # Assert
+    expected_steady = int(6.0 * mbe.GIB_BYTES)
+    expected_peak = int(6.5 * mbe.GIB_BYTES)
+    assert plan_b.steady_bytes == expected_steady
+    assert plan_b.peak_bytes == expected_peak
+
+
+def test_plan_budgets_rejects_unknown_plan() -> None:
+    """``for_plan`` raises ValueError for an object that is not a known plan."""
+    # Arrange — a stand-in type unrelated to ``mbe.Plan``
+    class _NotAPlan:
+        pass
+
+    bogus = _NotAPlan()
+
+    # Assert
+    with pytest.raises(ValueError):
+        mbe.PlanBudgets.for_plan(bogus)  # type: ignore[arg-type]
+
+
+# ───────────────────────── _percentile_int ─────────────────────
+
+
+def test_percentile_int_q_must_be_in_range() -> None:
+    """Percentile ranks below 0 or above 100 are rejected with ValueError."""
+    # Assert — one value on each side of the accepted range
+    for out_of_range in (-1.0, 101.0):
+        with pytest.raises(ValueError):
+            mbe._percentile_int([1, 2, 3], out_of_range)
+
+
+def test_percentile_int_empty_returns_none() -> None:
+    """An empty input has no percentile, so ``None`` comes back."""
+    # Act
+    median = mbe._percentile_int([], 50.0)
+
+    # Assert
+    assert median is None
+
+
+def test_percentile_int_single_value_returns_that_value() -> None:
+    """Every percentile of a one-element input is that element."""
+    # Assert — lowest, middle and highest rank all give 42
+    for rank in (0.0, 50.0, 100.0):
+        assert mbe._percentile_int([42], rank) == 42
+
+
+def test_percentile_int_linear_interpolation_then_rounded() -> None:
+    """p0, p50 and p100 of 100..1000 (step 100) are 100, 550 and 1000."""
+    # Arrange — ten evenly spaced values: 100, 200, ..., 1000
+    ladder = [100 * step for step in range(1, 11)]
+
+    # Assert — p50 of an even-length input falls midway between 500 and 600
+    expected_by_rank = {50.0: 550, 100.0: 1000, 0.0: 100}
+    for rank, expected in expected_by_rank.items():
+        assert mbe._percentile_int(ladder, rank) == expected
+
+
+# ───────────────────────── _post_warmup_window ─────────────────
+
+
+def test_post_warmup_drops_samples_inside_warmup_window() -> None:
+    """Samples stamped before ``warm_up_ms`` are dropped; the boundary is kept."""
+    # Arrange — each sample's memory fields simply reuse its timestamp
+    stamps_ms = (0, 10_000, 20_000, 30_000, 31_000)
+    samples = []
+    for stamp in stamps_ms:
+        samples.append(
+            mbe.MemorySample(
+                monotonic_ms=stamp, vmrss_bytes=stamp, tegrastats_used_bytes=stamp
+            )
+        )
+
+    # Act
+    survivors = mbe._post_warmup_window(samples, warm_up_ms=30_000)
+
+    # Assert — the sample exactly at 30 s survives along with the later one
+    surviving_stamps = [sample.monotonic_ms for sample in survivors]
+    assert surviving_stamps == [30_000, 31_000]
+
+
+def test_post_warmup_rejects_negative_warmup() -> None:
+    """A negative warm-up duration is rejected with ValueError."""
+    # Arrange
+    negative_warm_up_ms = -1
+
+    # Assert
+    with pytest.raises(ValueError):
+        mbe._post_warmup_window([], warm_up_ms=negative_warm_up_ms)
+
+
+def test_post_warmup_empty_samples_returns_empty() -> None:
+    """Filtering an empty sample list yields an empty list."""
+    # Act
+    survivors = mbe._post_warmup_window([], warm_up_ms=30_000)
+
+    # Assert
+    assert survivors == []
+
+
+# ───────────────────────── evaluate ────────────────────────────
+
+
+def _flat_samples(
+    n: int, *, vmrss: int, tegrastats: int, start_ms: int = 0
+) -> list[mbe.MemorySample]:
+    """Return ``n`` samples one second apart, all with the same memory readings.
+
+    The first sample is stamped ``start_ms``; every sample carries the given
+    ``vmrss`` and ``tegrastats`` byte counts unchanged.
+    """
+    flat: list[mbe.MemorySample] = []
+    for index in range(n):
+        stamp_ms = start_ms + index * 1000
+        flat.append(
+            mbe.MemorySample(
+                monotonic_ms=stamp_ms,
+                vmrss_bytes=vmrss,
+                tegrastats_used_bytes=tegrastats,
+            )
+        )
+    return flat
+
+
+def test_evaluate_plan_a_under_budget_passes() -> None:
+    """A flat 4.0 GiB run with no OOM events clears every Plan A check."""
+    # Arrange — both streams hold 4.0 GiB, under the 4.5 / 5.0 GiB budgets
+    four_gib = 4 * mbe.GIB_BYTES
+    flat_run = _flat_samples(60, vmrss=four_gib, tegrastats=four_gib, start_ms=30_000)
+
+    # Act
+    result = mbe.evaluate(flat_run, oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=0)
+
+    # Assert — each individual check, then the combined verdict
+    assert result.passes_steady_state
+    assert result.passes_peak
+    assert result.passes_no_oom
+    assert result.passes
+
+
+def test_evaluate_plan_a_steady_breach_fails_ac_2() -> None:
+    """A flat 4.8 GiB run exceeds Plan A's 4.5 GiB steady-state budget."""
+    # Arrange
+    over_steady = int(4.8 * mbe.GIB_BYTES)
+    flat_run = _flat_samples(20, vmrss=over_steady, tegrastats=over_steady)
+
+    # Act
+    result = mbe.evaluate(flat_run, oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=0)
+
+    # Assert
+    assert not result.passes_steady_state
+    assert not result.passes
+
+
+def test_evaluate_plan_a_peak_breach_fails_ac_3() -> None:
+    """One VmRSS spike above the peak budget fails AC-3 and the whole run."""
+    # Arrange — a 4.0 GiB baseline with a single 5.5 GiB VmRSS spike at index 10
+    baseline = int(4.0 * mbe.GIB_BYTES)
+    spike_bytes = int(5.5 * mbe.GIB_BYTES)
+    run = _flat_samples(20, vmrss=baseline, tegrastats=baseline)
+    spike_at = run[10].monotonic_ms
+    run[10] = mbe.MemorySample(
+        monotonic_ms=spike_at,
+        vmrss_bytes=spike_bytes,
+        tegrastats_used_bytes=baseline,
+    )
+
+    # Act
+    result = mbe.evaluate(run, oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=0)
+
+    # Assert — VmRSS max breaches peak budget
+    assert not result.passes_peak
+    assert not result.passes
+
+
+def test_evaluate_plan_b_relaxes_budgets() -> None:
+    """The same 5.5 GiB steady run fails Plan A but passes Plan B."""
+    # Arrange
+    steady = int(5.5 * mbe.GIB_BYTES)
+    flat_run = _flat_samples(20, vmrss=steady, tegrastats=steady)
+
+    # Act — evaluate identical samples under each plan
+    under_plan_a = mbe.evaluate(
+        flat_run, oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=0
+    )
+    under_plan_b = mbe.evaluate(
+        flat_run, oom_events=[], plan=mbe.Plan.PLAN_B, warm_up_ms=0
+    )
+
+    # Assert
+    assert not under_plan_a.passes_steady_state
+    assert under_plan_b.passes_steady_state
+
+
+def test_evaluate_with_oom_event_fails_ac_4() -> None:
+    """A single recorded OOM event fails AC-4 even when memory stays low."""
+    # Arrange — memory sits at a flat 3 GiB; one OOM event is reported
+    three_gib = 3 * mbe.GIB_BYTES
+    flat_run = _flat_samples(20, vmrss=three_gib, tegrastats=three_gib)
+    oom_event = mbe.OomEvent(
+        monotonic_ms=12_345, snippet="Out of memory: Killed process 4242"
+    )
+
+    # Act
+    result = mbe.evaluate(
+        flat_run, oom_events=[oom_event], plan=mbe.Plan.PLAN_A, warm_up_ms=0
+    )
+
+    # Assert
+    assert not result.passes_no_oom
+    assert not result.passes
+
+
+def test_evaluate_warmup_eliminates_spike_during_warmup() -> None:
+    """A spike inside the warm-up window is ignored by the evaluation."""
+    # Arrange — a 7 GiB spike at t=0, then a clean 3 GiB run from t=30 s.
+    # AC-2/3 look at the POST-warm-up window only, so the run should pass.
+    three_gib = 3 * mbe.GIB_BYTES
+    spike_bytes = int(7.0 * mbe.GIB_BYTES)
+    warmup_spike = mbe.MemorySample(
+        monotonic_ms=0, vmrss_bytes=spike_bytes, tegrastats_used_bytes=spike_bytes
+    )
+    clean_tail = _flat_samples(
+        20, vmrss=three_gib, tegrastats=three_gib, start_ms=30_000
+    )
+    run = [warmup_spike, *clean_tail]
+
+    # Act
+    result = mbe.evaluate(run, oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=30_000)
+
+    # Assert — the reported VmRSS max comes from the clean tail, not the spike
+    assert result.passes
+    observed_max = result.vmrss.max_bytes or 0
+    assert observed_max == three_gib
+
+
+def test_evaluate_empty_samples_returns_none_stats() -> None:
+    """With no samples both p50 values are ``None`` and the budget checks fail."""
+    # Act
+    result = mbe.evaluate([], oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=0)
+
+    # Assert — no statistics can be computed for either stream
+    assert result.vmrss.p50_bytes is None
+    assert result.tegrastats.p50_bytes is None
+    # Assert — neither budget check can pass without data
+    assert not result.passes_steady_state
+    assert not result.passes_peak
+
+
+# ───────────────────────── CSV evidence ─────────────────────────
+
+
+def test_write_csv_evidence_one_row_with_verdict(tmp_path: Path) -> None:
+    """The evidence CSV leads with the plan and ends its row with ``true``."""
+    # Arrange — a passing Plan A report; the target sits in a sub-directory
+    three_gib = 3 * mbe.GIB_BYTES
+    flat_run = _flat_samples(10, vmrss=three_gib, tegrastats=three_gib)
+    result = mbe.evaluate(flat_run, oom_events=[], plan=mbe.Plan.PLAN_A, warm_up_ms=0)
+    csv_path = tmp_path / "lim-01" / "report.csv"
+
+    # Act
+    written_to = mbe.write_csv_evidence(csv_path, result)
+
+    # Assert — the writer hands back the path it was given
+    assert written_to == csv_path
+    with csv_path.open() as handle:
+        header, data_row, *_ = list(csv.reader(handle))
+    assert header[0] == "plan"
+    assert data_row[0] == mbe.Plan.PLAN_A.value
+    assert data_row[-1] == "true"
+
+
+def test_write_oom_events_csv_truncates_long_snippet(tmp_path: Path) -> None:
+    """A 500-character snippet is written as 200 characters."""
+    # Arrange
+    oversized = mbe.OomEvent(monotonic_ms=1, snippet="X" * 500)
+    csv_path = tmp_path / "lim-01" / "oom.csv"
+
+    # Act
+    mbe.write_oom_events_csv(csv_path, [oversized])
+
+    # Assert — column 2 of the first data row holds the snippet
+    with csv_path.open() as handle:
+        parsed = list(csv.reader(handle))
+    written_snippet = parsed[1][2]
+    assert len(written_snippet) == 200
@@ -0,0 +1,238 @@
+"""Unit tests for ``runner.helpers.storage_budget_evaluator`` (AZ-442 / NFT-LIM-03+05)."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import storage_budget_evaluator as sbe
+
+# Short local alias for the evaluator's GiB constant, used to size test volumes.
+GIB = sbe.GIB_BYTES
+
+
+def _snapshot(
+    monotonic_ms: int,
+    *,
+    tile_cache: int = 0,
+    tile_cache_write: int = 0,
+    fdr_output: int = 0,
+    thumbnail_log: int = 0,
+) -> sbe.VolumeSnapshot:
+    """Build a ``VolumeSnapshot`` with short keyword names; sizes default to 0."""
+    volume_sizes = {
+        "tile_cache_bytes": tile_cache,
+        "tile_cache_write_bytes": tile_cache_write,
+        "fdr_output_bytes": fdr_output,
+        "thumbnail_log_bytes": thumbnail_log,
+    }
+    return sbe.VolumeSnapshot(monotonic_ms=monotonic_ms, **volume_sizes)
+
+
+# ───────────────────────── VolumeSnapshot.aggregate_bytes ─────────────────
+
+
+def test_aggregate_excludes_thumbnail_log() -> None:
+    """``aggregate_bytes`` sums the three storage volumes but not the thumbnail log."""
+    # Arrange — the thumbnail log is NOT in the aggregate per AC-1 scope
+    snapshot = _snapshot(
+        0,
+        tile_cache=10,
+        tile_cache_write=20,
+        fdr_output=30,
+        thumbnail_log=999,
+    )
+
+    # Assert — 10 + 20 + 30, with the 999-byte thumbnail log left out
+    assert snapshot.aggregate_bytes == 60
+
+
+# ───────────────────────── evaluate ────────────────────────────
+
+
+def test_evaluate_under_aggregate_and_thumbnail_budgets_passes() -> None:
+    """A 50 GiB aggregate with a small thumbnail log passes both checks."""
+    # Arrange — end-of-run aggregate = 50 GiB; thumb @ 30 min = 0.05 GiB → 8h = 0.8 GiB
+    start = _snapshot(0, tile_cache=0)
+    end = _snapshot(
+        30 * 60_000,
+        tile_cache=20 * GIB,
+        tile_cache_write=10 * GIB,
+        fdr_output=20 * GIB,
+        thumbnail_log=int(0.05 * GIB),
+    )
+
+    # Act
+    result = sbe.evaluate([start, end])
+
+    # Assert
+    assert result.passes_aggregate
+    assert result.passes_thumbnail_log
+    assert result.passes
+
+
+def test_evaluate_aggregate_breach_fails_ac_1() -> None:
+    """A 101 GiB aggregate exceeds the 100 GiB budget and fails AC-1."""
+    # Arrange — 40 + 30 + 31 GiB = 101 GiB
+    over_budget = _snapshot(
+        30 * 60_000,
+        tile_cache=40 * GIB,
+        tile_cache_write=30 * GIB,
+        fdr_output=31 * GIB,
+        thumbnail_log=0,
+    )
+
+    # Act
+    result = sbe.evaluate([over_budget])
+
+    # Assert
+    assert not result.passes_aggregate
+    assert not result.passes
+
+
+def test_evaluate_aggregate_at_budget_passes_ac_1() -> None:
+    """An aggregate of exactly 100 GiB satisfies the "≤" budget of AC-1."""
+    # Arrange — 40 + 30 + 30 GiB = 100 GiB
+    at_budget = _snapshot(
+        30 * 60_000,
+        tile_cache=40 * GIB,
+        tile_cache_write=30 * GIB,
+        fdr_output=30 * GIB,
+    )
+
+    # Act
+    result = sbe.evaluate([at_budget])
+
+    # Assert
+    assert result.passes_aggregate
+
+
+def test_evaluate_thumbnail_log_strict_lt_fails_at_budget() -> None:
+    """A thumbnail log extrapolating to exactly the budget fails the strict "<"."""
+    # Arrange — one sixteenth of the budget at 30 min projects to the budget at 8 h
+    budget = sbe.THUMBNAIL_LOG_BUDGET_BYTES
+    at_30min = _snapshot(30 * 60_000, thumbnail_log=budget // 16)
+
+    # Act
+    result = sbe.evaluate([at_30min])
+
+    # Assert — extrapolated equals budget → AC-2 fails (strict <)
+    assert result.thumbnail_log_extrapolated_8h_bytes == budget
+    assert not result.passes_thumbnail_log
+
+
+def test_evaluate_thumbnail_log_just_under_budget_passes() -> None:
+    """A thumbnail log extrapolating slightly below the budget passes."""
+    # Arrange — start from 1 KiB under the budget, then take one sixteenth
+    slightly_under = sbe.THUMBNAIL_LOG_BUDGET_BYTES - 1024
+    at_30min = _snapshot(30 * 60_000, thumbnail_log=slightly_under // 16)
+
+    # Act
+    result = sbe.evaluate([at_30min])
+
+    # Assert
+    assert result.passes_thumbnail_log
+
+
+def test_evaluate_uses_end_of_run_snapshot_not_max() -> None:
+    """AC-1 is judged on the final snapshot, not on the largest one seen."""
+    # Arrange — 200 GiB mid-run peak, then only 30 GiB at the end of the run
+    mid_run_peak = _snapshot(
+        10 * 60_000,
+        tile_cache=200 * GIB,
+        tile_cache_write=0,
+        fdr_output=0,
+    )
+    end_of_run = _snapshot(
+        30 * 60_000,
+        tile_cache=10 * GIB,
+        tile_cache_write=10 * GIB,
+        fdr_output=10 * GIB,
+    )
+
+    # Act
+    result = sbe.evaluate([mid_run_peak, end_of_run])
+
+    # Assert — uses last snapshot (sorted by monotonic_ms)
+    assert result.aggregate_at_end_bytes == 30 * GIB
+    assert result.passes_aggregate
+
+
+def test_evaluate_unsorted_samples_are_sorted_before_end_pick() -> None:
+    """The end snapshot is chosen by timestamp, not by position in the input."""
+    # Arrange — the later (t = 30 min) snapshot is listed first
+    later = _snapshot(30 * 60_000, tile_cache=1 * GIB)
+    earlier = _snapshot(0, tile_cache=200 * GIB)
+
+    # Act
+    result = sbe.evaluate([later, earlier])
+
+    # Assert — end snapshot is the t=30 min one
+    assert result.aggregate_at_end_bytes == 1 * GIB
+
+
+def test_evaluate_empty_samples_returns_none_stats() -> None:
+    """With no snapshots every measured field is ``None`` and the run fails."""
+    # Act
+    result = sbe.evaluate([])
+
+    # Assert
+    assert result.sample_count == 0
+    for field in (
+        "aggregate_at_end_bytes",
+        "thumbnail_log_at_end_bytes",
+        "thumbnail_log_extrapolated_8h_bytes",
+    ):
+        assert getattr(result, field) is None, field
+    assert not result.passes
+
+
+def test_evaluate_rejects_non_positive_budgets() -> None:
+    """A zero aggregate budget or zero thumbnail-log budget raises ValueError."""
+    # Assert — each budget is zeroed on its own
+    zeroed_budgets = (
+        {"aggregate_budget_bytes": 0},
+        {"thumbnail_log_budget_bytes": 0},
+    )
+    for bad_kwargs in zeroed_budgets:
+        with pytest.raises(ValueError):
+            sbe.evaluate([], **bad_kwargs)
+
+
+# ───────────────────────── CSV evidence ─────────────────────────
+
+
+def test_write_csv_evidence_writes_aggregate_and_thumbnail(tmp_path: Path) -> None:
+    """The evidence CSV starts with ``sample_count`` and ends its row with ``true``."""
+    # Arrange — one passing snapshot: 30 GiB aggregate, 0.05 GiB thumbnail log
+    end_of_run = _snapshot(
+        30 * 60_000,
+        tile_cache=10 * GIB,
+        tile_cache_write=10 * GIB,
+        fdr_output=10 * GIB,
+        thumbnail_log=int(0.05 * GIB),
+    )
+    result = sbe.evaluate([end_of_run])
+    csv_path = tmp_path / "report.csv"
+
+    # Act
+    sbe.write_csv_evidence(csv_path, result)
+
+    # Assert
+    with csv_path.open() as handle:
+        header, data_row, *_ = list(csv.reader(handle))
+    assert header[0] == "sample_count"
+    assert data_row[-1] == "true"
+
+
+def test_write_per_minute_csv_orders_by_timestamp(tmp_path: Path) -> None:
+    """Rows are written in ascending timestamp order even for unsorted input."""
+    # Arrange — the later snapshot is listed first
+    samples = [
+        _snapshot(30 * 60_000, tile_cache=10 * GIB),
+        _snapshot(0, tile_cache=0),
+    ]
+    out = tmp_path / "per-min.csv"
+
+    # Act
+    sbe.write_per_minute_csv(out, samples)
+
+    # Assert — newline="" is how the csv module recommends opening CSV files
+    with out.open(newline="") as fh:
+        rows = list(csv.reader(fh))
+    # Column 1 of every data row (everything after the header) is the timestamp
+    timestamps = [int(r[1]) for r in rows[1:]]
+    # Guard against a vacuous pass: an empty list is trivially "sorted"
+    assert timestamps, "expected at least one data row after the header"
+    assert timestamps == sorted(timestamps)
@@ -0,0 +1,260 @@
+"""Unit tests for ``runner.helpers.thermal_envelope_evaluator`` (AZ-443 / NFT-LIM-04)."""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+
+import pytest
+
+from runner.helpers import thermal_envelope_evaluator as tee
+
+
+# ───────────────────────── ThermalThresholds ─────────────────────────
+
+
+def test_default_thresholds_match_orin_nano_super() -> None:
+    """Defaults are 97 °C (CPU) and 95 °C (SoC); budgets subtract ``HEADROOM_C``."""
+    # Arrange / Act
+    defaults = tee.ThermalThresholds()
+
+    # Assert — raw throttle points
+    assert defaults.cpu_t_throttle_c == 97.0
+    assert defaults.soc_t_throttle_c == 95.0
+    # Assert — derived budgets
+    assert defaults.cpu_budget_c == 97.0 - tee.HEADROOM_C
+    assert defaults.soc_budget_c == 95.0 - tee.HEADROOM_C
+
+
+def test_load_from_fixture_round_trip(tmp_path: Path) -> None:
+    """Throttle points written to a JSON fixture are read back unchanged."""
+    # Arrange
+    fixture_path = tmp_path / "thermal.json"
+    fixture_path.write_text(
+        json.dumps({"cpu_t_throttle_c": 99.5, "soc_t_throttle_c": 92.0})
+    )
+
+    # Act
+    loaded = tee.ThermalThresholds.load_from_fixture(fixture_path)
+
+    # Assert
+    assert loaded.cpu_t_throttle_c == 99.5
+    assert loaded.soc_t_throttle_c == 92.0
+
+
+def test_load_from_fixture_rejects_non_object(tmp_path: Path) -> None:
+    """A fixture whose top-level JSON value is an array raises ValueError."""
+    # Arrange
+    fixture_path = tmp_path / "thermal.json"
+    fixture_path.write_text("[1, 2, 3]")
+
+    # Assert
+    with pytest.raises(ValueError):
+        tee.ThermalThresholds.load_from_fixture(fixture_path)
+
+
+def test_load_from_fixture_rejects_missing_key(tmp_path: Path) -> None:
+    """A fixture lacking ``soc_t_throttle_c`` raises ValueError."""
+    # Arrange — only the CPU throttle point is present
+    cpu_only = {"cpu_t_throttle_c": 97.0}
+    fixture_path = tmp_path / "thermal.json"
+    fixture_path.write_text(json.dumps(cpu_only))
+
+    # Assert
+    with pytest.raises(ValueError):
+        tee.ThermalThresholds.load_from_fixture(fixture_path)
+
+
+def test_load_from_fixture_rejects_non_numeric(tmp_path: Path) -> None:
+    """A fixture with a string where a temperature belongs raises ValueError."""
+    # Arrange — the CPU throttle point is text rather than a number
+    malformed = {"cpu_t_throttle_c": "hot", "soc_t_throttle_c": 95.0}
+    fixture_path = tmp_path / "thermal.json"
+    fixture_path.write_text(json.dumps(malformed))
+
+    # Assert
+    with pytest.raises(ValueError):
+        tee.ThermalThresholds.load_from_fixture(fixture_path)
+
+
+# ───────────────────────── _percentile_float ─────────────────────
+
+
+def test_percentile_float_q_must_be_in_range() -> None:
+    """Percentile ranks just below 0 or just above 100 raise ValueError."""
+    # Assert — one value on each side of the accepted range
+    for out_of_range in (-0.1, 100.1):
+        with pytest.raises(ValueError):
+            tee._percentile_float([10.0], out_of_range)
+
+
+def test_percentile_float_empty_returns_none() -> None:
+    """An empty input has no percentile, so ``None`` comes back."""
+    # Act
+    p99 = tee._percentile_float([], 99.0)
+
+    # Assert
+    assert p99 is None
+
+
+def test_percentile_float_single_value_returns_that_value() -> None:
+    """The p99 of a one-element input is that element."""
+    # Act
+    p99 = tee._percentile_float([55.0], 99.0)
+
+    # Assert
+    assert p99 == 55.0
+
+
+def test_percentile_float_known_distribution() -> None:
+    """The p99 of 1.0..100.0 is 99.01 under linear interpolation."""
+    # Arrange — the hundred values 1.0, 2.0, ..., 100.0
+    one_to_hundred = list(map(float, range(1, 101)))
+
+    # Act
+    p99 = tee._percentile_float(one_to_hundred, 99.0)
+
+    # Assert — rank 98.01 interpolates between values[98] and values[99]
+    assert p99 == pytest.approx(99.01)
+
+
+# ───────────────────────── evaluate ────────────────────────────
+
+
+def _cool_samples(
+    n: int = 60, cpu_c: float = 60.0, soc_c: float = 55.0
+) -> list[tee.ThermalSample]:
+    """Return ``n`` samples one second apart, all at the same CPU and SoC temperature."""
+    flat: list[tee.ThermalSample] = []
+    for second in range(n):
+        flat.append(
+            tee.ThermalSample(
+                monotonic_ms=second * 1000, cpu_temp_c=cpu_c, soc_temp_c=soc_c
+            )
+        )
+    return flat
+
+
+def test_evaluate_cool_run_passes() -> None:
+    """A cool run without throttle events clears every check."""
+    # Arrange — default helper temperatures against the default thresholds
+    limits = tee.ThermalThresholds()
+    cool_run = _cool_samples()
+
+    # Act
+    result = tee.evaluate(cool_run, throttle_events=[], thresholds=limits)
+
+    # Assert — each individual check, then the combined verdict
+    assert result.passes_no_throttle
+    assert result.passes_headroom
+    assert result.passes
+
+
+def test_evaluate_throttle_event_fails_ac_2() -> None:
+    """A single throttle event fails AC-2 even when temperatures stay cool."""
+    # Arrange
+    limits = tee.ThermalThresholds()
+    cool_run = _cool_samples()
+    throttle = tee.ThrottleEvent(
+        monotonic_ms=42, snippet="thermal_throttle: zone CPU_thermal"
+    )
+
+    # Act
+    result = tee.evaluate(cool_run, throttle_events=[throttle], thresholds=limits)
+
+    # Assert
+    assert not result.passes_no_throttle
+    assert not result.passes
+
+
+def test_evaluate_cpu_above_budget_fails_ac_3() -> None:
+    """A CPU held at 95 °C exceeds its 92 °C budget (97 − 5) and fails AC-3."""
+    # Arrange — only the CPU is hot; the SoC stays at 55 °C
+    limits = tee.ThermalThresholds()
+    hot_cpu_run = _cool_samples(cpu_c=95.0, soc_c=55.0)
+
+    # Act
+    result = tee.evaluate(hot_cpu_run, throttle_events=[], thresholds=limits)
+
+    # Assert
+    assert not result.passes_headroom
+    assert not result.passes
+
+
+def test_evaluate_soc_above_budget_fails_ac_3() -> None:
+    """An SoC held at 92 °C exceeds its 90 °C budget (95 − 5) and fails AC-3."""
+    # Arrange — only the SoC is hot; the CPU stays at 60 °C
+    limits = tee.ThermalThresholds()
+    hot_soc_run = _cool_samples(cpu_c=60.0, soc_c=92.0)
+
+    # Act
+    result = tee.evaluate(hot_soc_run, throttle_events=[], thresholds=limits)
+
+    # Assert
+    assert not result.passes_headroom
+
+
+def test_evaluate_cpu_p99_exactly_at_budget_passes() -> None:
+    """Temperatures sitting exactly on both budgets still pass the headroom check."""
+    # Arrange — a flat run with CPU and SoC each pinned to their budget
+    limits = tee.ThermalThresholds()
+    at_budget_run = _cool_samples(
+        cpu_c=limits.cpu_budget_c, soc_c=limits.soc_budget_c
+    )
+
+    # Act
+    result = tee.evaluate(at_budget_run, throttle_events=[], thresholds=limits)
+
+    # Assert — "≤" means at-budget passes
+    assert result.passes_headroom
+
+
+def test_evaluate_empty_samples_returns_none_p99_and_fails() -> None:
+    """With no samples both p99 values are ``None`` and the headroom check fails."""
+    # Arrange
+    limits = tee.ThermalThresholds()
+
+    # Act
+    result = tee.evaluate([], throttle_events=[], thresholds=limits)
+
+    # Assert
+    assert result.cpu.p99_c is None
+    assert result.soc.p99_c is None
+    assert not result.passes_headroom
+
+
+# ───────────────────────── PARTIAL annotation ────────────────────
+
+
+def test_write_traceability_partial_creates_file(tmp_path: Path) -> None:
+    """The annotation is written even when the status file does not exist yet."""
+    # Arrange — the target path has not been created
+    status_path = tmp_path / "traceability-status.json"
+
+    # Act
+    tee.write_traceability_partial_annotation(status_path)
+
+    # Assert
+    status = json.loads(status_path.read_text())
+    assert status["AC-NEW-5"] == "PARTIAL — chamber required for full"
+
+
+def test_write_traceability_partial_merges_existing(tmp_path: Path) -> None:
+    """Entries already in the status file survive alongside the new annotation."""
+    # Arrange — the status file already carries an unrelated entry
+    status_path = tmp_path / "traceability-status.json"
+    status_path.write_text(json.dumps({"AC-OTHER": "covered"}))
+
+    # Act
+    tee.write_traceability_partial_annotation(status_path)
+
+    # Assert — both the old and the new entry are present
+    status = json.loads(status_path.read_text())
+    assert status["AC-NEW-5"] == "PARTIAL — chamber required for full"
+    assert status["AC-OTHER"] == "covered"
+
+
+def test_write_traceability_partial_rejects_non_object_existing(tmp_path: Path) -> None:
+    """An existing status file holding a JSON array raises ValueError."""
+    # Arrange
+    status_path = tmp_path / "traceability-status.json"
+    status_path.write_text("[1, 2]")
+
+    # Assert
+    with pytest.raises(ValueError):
+        tee.write_traceability_partial_annotation(status_path)
+
+
+# ───────────────────────── CSV evidence ─────────────────────────
+
+
+def test_write_csv_evidence_one_row(tmp_path: Path) -> None:
+    """The evidence CSV starts with ``cpu_t_throttle_c`` and ends its row with ``true``."""
+    # Arrange — a passing report from a cool run
+    limits = tee.ThermalThresholds()
+    result = tee.evaluate(_cool_samples(), throttle_events=[], thresholds=limits)
+    csv_path = tmp_path / "report.csv"
+
+    # Act
+    tee.write_csv_evidence(csv_path, result)
+
+    # Assert
+    with csv_path.open() as handle:
+        header, data_row, *_ = list(csv.reader(handle))
+    assert header[0] == "cpu_t_throttle_c"
+    assert data_row[-1] == "true"
+
+
+def test_write_throttle_events_csv_truncates_long_snippet(tmp_path: Path) -> None:
+    """A 500-character snippet is written as 200 characters."""
+    # Arrange
+    oversized = tee.ThrottleEvent(monotonic_ms=1, snippet="T" * 500)
+    csv_path = tmp_path / "throttle.csv"
+
+    # Act
+    tee.write_throttle_events_csv(csv_path, [oversized])
+
+    # Assert — column 2 of the first data row holds the snippet
+    with csv_path.open() as handle:
+        parsed = list(csv.reader(handle))
+    written_snippet = parsed[1][2]
+    assert len(written_snippet) == 200