[AZ-446] CSV reporter: band + ci95 annotations + report.csv emitter

Batch 89 — adds optional `band`, `ci95_low`, `ci95_high` kw-only
parameters to `_NfrRecorder.record_metric` and emits a new per-metric
report.csv artifact (one row per scenario × metric, columns:
scenario_id, metric_name, value, value_band, ci95_low, ci95_high,
ac_id, outcome). Backwards compatible — existing 4-arg callers
unchanged; an unbalanced ci95 pair raises ValueError. report.csv is
written once per pytest session from `pytest_sessionfinish`, so the
annotation pass runs once per CI invocation regardless of the
(fc_adapter, vio_strategy) combination (AC-3). `regression-baseline.json`
is intentionally kept flat to preserve the diff contract used by
regression-detection tooling.

NFT-RES-03 + NFT-PERF-01 scenarios updated to pass real bands and
compute empirical 2.5/97.5-percentile ci95 from their own sample
streams (per-iteration envelope ratios for Monte Carlo,
per-frame latency samples for N-sample latency).

Tests: 1229 e2e/_unit_tests pass (+6 vs. batch 88 for AZ-446
band/CI behavior, value-error on unbalanced ci95, report.csv columns,
explicit-path override, and end-to-end emission via the pytest
plugin). Code review: PASS_WITH_WARNINGS — 1 Low (empirical-CI
semantics, documented inline), 1 Medium carried over from batch 88's
cumulative-review backlog (write_csv_evidence + _resolve_fixture_path
duplication is outside AZ-446 reporting scope).

This commit closes Step 10 Implement Tests for cycle 1 (41 of 41
blackbox-test tasks done, AZ-406..AZ-446). Greenfield auto-chains to
Step 11 Run Tests next.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-17 18:14:00 +03:00
parent 6e4a575221
commit 33e683dc0f
8 changed files with 595 additions and 13 deletions
@@ -136,10 +136,19 @@ def test_nft_perf_01_e2e_latency(
f"nft_perf_01.{r.config_id}.frame_drop_ratio",
float(r.frame_drop_ratio),
ac_id="AC-4",
band=f"{r.frame_drop_budget:.2f}",
)
# AZ-446 AC-2 — CI95 columns derive from the empirical 2.5 / 97.5
# percentile of the underlying per-frame latency samples (N≥900).
latencies = [s.latency_ms for s in r.samples]
ci_low, ci_high = _percentile_pair(latencies, 2.5, 97.5)
if r.p50_ms is not None:
nfr_recorder.record_metric(
f"nft_perf_01.{r.config_id}.latency_ms_p50", float(r.p50_ms)
f"nft_perf_01.{r.config_id}.latency_ms_p50",
float(r.p50_ms),
band="(median, no budget)",
ci95_low=ci_low,
ci95_high=ci_high,
)
if r.p95_ms is not None:
ac_id = "AC-3" if r.config_id == "k2-hybrid-50c" else "AC-2"
@@ -147,10 +156,17 @@ def test_nft_perf_01_e2e_latency(
f"nft_perf_01.{r.config_id}.latency_ms_p95",
float(r.p95_ms),
ac_id=ac_id,
band=f"{r.p95_budget_ms:.0f} ms",
ci95_low=ci_low,
ci95_high=ci_high,
)
if r.p99_ms is not None:
nfr_recorder.record_metric(
f"nft_perf_01.{r.config_id}.latency_ms_p99", float(r.p99_ms)
f"nft_perf_01.{r.config_id}.latency_ms_p99",
float(r.p99_ms),
band="(p99, no budget)",
ci95_low=ci_low,
ci95_high=ci_high,
)
breaches = []
@@ -170,6 +186,31 @@ def test_nft_perf_01_e2e_latency(
assert not breaches, "\n".join(breaches)
def _percentile_pair(
values: list[float], q_low: float, q_high: float
) -> tuple[float | None, float | None]:
"""Linear-interpolation percentile pair (AZ-446 CI95 helper).
Returns ``(None, None)`` for empty input. Used to project the
empirical 95% interval (2.5th / 97.5th percentile) of the
underlying latency samples onto the recorded percentile metrics.
"""
if not values:
return None, None
ordered = sorted(values)
if len(ordered) == 1:
return float(ordered[0]), float(ordered[0])
def _at(q: float) -> float:
rank = (q / 100.0) * (len(ordered) - 1)
lo = int(rank)
hi = min(lo + 1, len(ordered) - 1)
frac = rank - lo
return float(ordered[lo] + (ordered[hi] - ordered[lo]) * frac)
return _at(q_low), _at(q_high)
def _resolve_latency_fixture_path() -> Path:
from runner.helpers import sitl_observer
@@ -115,14 +115,26 @@ def test_nft_res_03_monte_carlo(
)
nfr_recorder.record_metric(
"nft_res_03.iteration_count", float(report1.iteration_count), ac_id="AC-1"
"nft_res_03.iteration_count",
float(report1.iteration_count),
ac_id="AC-1",
band=f"{report1.min_iteration_count} iterations",
)
nfr_recorder.record_metric(
"nft_res_03.total_samples", float(report1.total_samples)
)
if report1.envelope_ratio is not None:
# AZ-446 AC-2 — per-iteration envelope ratios provide the empirical
# 95% interval (2.5th / 97.5th percentile across 100 iterations).
per_iter_ratios = _per_iteration_envelope_ratios(report1)
ci_low, ci_high = _percentile_pair(per_iter_ratios, 2.5, 97.5)
nfr_recorder.record_metric(
"nft_res_03.envelope_ratio", float(report1.envelope_ratio), ac_id="AC-3"
"nft_res_03.envelope_ratio",
float(report1.envelope_ratio),
ac_id="AC-3",
band=f"{report1.envelope_ratio_budget:.2f}",
ci95_low=ci_low,
ci95_high=ci_high,
)
nfr_recorder.record_metric(
"nft_res_03.master_seed", float(report1.master_seed)
@@ -147,6 +159,41 @@ def _full_matrix_enabled() -> bool:
return os.environ.get(NFT_RES_03_FULL_MATRIX_ENV_VAR, "").strip() in {"1", "true", "yes"}
def _per_iteration_envelope_ratios(report: mce.MonteCarloReport) -> list[float]:
    """Collect the per-iteration ``covered / frames`` ratio (AZ-446 CI95 input).

    A sample counts as covered when its ``error_m`` does not exceed
    ``mce.ENVELOPE_MULTIPLIER`` times its ``cov_semi_major_m``. Iterations
    without samples contribute no ratio, so the result may be shorter than
    ``report.iterations``.
    """

    def _covered_fraction(samples) -> float:
        # Share of samples whose error stays within the scaled envelope.
        hits = 0
        for sample in samples:
            if sample.error_m <= mce.ENVELOPE_MULTIPLIER * sample.cov_semi_major_m:
                hits += 1
        return hits / len(samples)

    return [
        _covered_fraction(iteration.samples)
        for iteration in report.iterations
        if iteration.samples  # skip empty iterations (avoids division by zero)
    ]
def _percentile_pair(
values: list[float], q_low: float, q_high: float
) -> tuple[float | None, float | None]:
"""Linear-interpolation percentiles. Returns ``(None, None)`` if empty."""
if not values:
return None, None
ordered = sorted(values)
if len(ordered) == 1:
return float(ordered[0]), float(ordered[0])
def _at(q: float) -> float:
rank = (q / 100.0) * (len(ordered) - 1)
lo = int(rank)
hi = min(lo + 1, len(ordered) - 1)
frac = rank - lo
return float(ordered[lo] + (ordered[hi] - ordered[lo]) * frac)
return _at(q_low), _at(q_high)
def _resolve_fixture_path() -> Path:
raw = os.environ.get(NFT_RES_03_FIXTURE_ENV_VAR, "").strip()
from runner.helpers import sitl_observer