Files
Oleksandr Bezdieniezhnykh 12d0008763
ci/woodpecker/push/02-build-push Pipeline failed
fixes in tests, autodev update
2026-06-18 12:13:43 +03:00

485 lines
15 KiB
Python

"""AZ-300 — ``PytorchFp16Runtime`` acceptance tests.
CPU-runnable: AC-1, AC-2, AC-6, AC-7 (failure paths trip before any
``.cuda()`` call).
CUDA-only: AC-3, AC-4, AC-5, AC-8, NFR-perf-deserialize,
NFR-reliability-eval-mode. Marked ``@pytest.mark.skipif(...)`` so the
suite stays green on macOS / CI-without-GPU; the Tier-2 Jetson CI runs
the full sweep.
"""
from __future__ import annotations
import hashlib
import time
from collections.abc import Iterator
from pathlib import Path
import numpy as np
import pytest
pytest.importorskip("torch")
import torch
from torch import nn
from gps_denied_onboard._types.inference import (
BuildConfig,
EngineCacheEntry,
PrecisionMode,
)
from gps_denied_onboard._types.thermal import ThermalState
from gps_denied_onboard.components.c7_inference import (
C7InferenceConfig,
EngineDeserializeError,
InferenceError,
InferenceRuntime,
OutOfMemoryError,
register_architecture,
)
from gps_denied_onboard.components.c7_inference.architecture_registry import (
_DEFAULT_REGISTRY,
)
from gps_denied_onboard.components.c7_inference.pytorch_fp16_runtime import (
PytorchEngineHandle,
PytorchFp16Runtime,
)
from gps_denied_onboard.config.schema import Config
# Skip marker for tests that need a CUDA device: when
# ``torch.cuda.is_available()`` is false at import time, any test decorated
# with ``@_REQUIRE_CUDA`` is skipped with the reason below.
_REQUIRE_CUDA = pytest.mark.skipif(
not torch.cuda.is_available(),
reason="CUDA not available on this host (Tier-1 CI / macOS dev)",
)
class _TinyLinear(nn.Module):
    """Minimal 3-input / 2-output linear model with reproducible parameters.

    The layer's weight and bias are overwritten with values drawn from a
    private generator seeded with 42, so every instance carries identical
    parameters. ``forward`` accepts its input under the name ``x``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(3, 2)
        # Draw order matters: the weight is sampled first, then the bias,
        # both from the same seeded generator.
        seeded = torch.Generator().manual_seed(42)
        weight_values = torch.randn(2, 3, generator=seeded, dtype=torch.float32)
        bias_values = torch.randn(2, generator=seeded, dtype=torch.float32)
        with torch.no_grad():
            self.fc.weight.copy_(weight_values)
            self.fc.bias.copy_(bias_values)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the single linear layer to ``x``."""
        return self.fc(x)
class _DifferentArch(nn.Module):
    """Linear model with 4 inputs and 2 outputs.

    Its ``fc`` layer is built as ``nn.Linear(4, 2)``, so a state_dict saved
    from :class:`_TinyLinear` (``nn.Linear(3, 2)``) does not fit it.
    """

    def __init__(self) -> None:
        super().__init__()
        layer = nn.Linear(4, 2)
        self.fc = layer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the single linear layer to ``x``."""
        projected = self.fc(x)
        return projected
@pytest.fixture
def clean_registry() -> Iterator[None]:
    """Run the test against an emptied default architecture registry.

    The registry's contents are copied and cleared before the test body
    runs; afterwards the registry is cleared again and the copy is put back.
    """
    snapshot = dict(_DEFAULT_REGISTRY)
    _DEFAULT_REGISTRY.clear()
    yield
    # Discard anything the test registered before restoring the snapshot.
    _DEFAULT_REGISTRY.clear()
    _DEFAULT_REGISTRY.update(snapshot)
@pytest.fixture
def config() -> Config:
    """Build a ``Config`` whose C7 inference block selects ``pytorch_fp16``."""
    c7_block = C7InferenceConfig(runtime="pytorch_fp16")
    return Config.with_blocks(c7_inference=c7_block)
@pytest.fixture
def tiny_checkpoint(tmp_path: Path) -> Path:
    """Save a fresh ``_TinyLinear`` state_dict as ``tiny.pt`` and return its path.

    The file is written inside the per-test ``tmp_path`` directory.
    """
    checkpoint_path = tmp_path / "tiny.pt"
    state = _TinyLinear().state_dict()
    torch.save(state, checkpoint_path)
    return checkpoint_path
# ----------------------------------------------------------------------
# AC-1: Protocol conformance + label.
def test_ac1_protocol_conformance(config: Config) -> None:
    """AC-1: the runtime is an ``InferenceRuntime`` and reports its label."""
    rt = PytorchFp16Runtime(config)

    assert isinstance(rt, InferenceRuntime)
    label = rt.current_runtime_label()
    assert label == "pytorch_fp16"
# ----------------------------------------------------------------------
# AC-2: compile_engine no-op.
def test_ac2_compile_engine_is_noop(
    config: Config, tiny_checkpoint: Path
) -> None:
    """AC-2: ``compile_engine`` describes the checkpoint itself.

    The returned entry must point at the input checkpoint, carry no
    ``sm``/``jp``/``trt`` values, record FP16 precision, the checkpoint stem
    as ``model_name`` and the SHA-256 of the file; no sibling ``.engine``
    file may appear and the call must finish within the asserted time bound.
    """
    # Arrange
    rt = PytorchFp16Runtime(config)
    fp16_build = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    engine_name = f"{tiny_checkpoint.stem}.engine"
    sibling_engine = tiny_checkpoint.parent / engine_name

    # Act — only the compile_engine call is inside the timed window.
    started = time.perf_counter()
    entry = rt.compile_engine(tiny_checkpoint, fp16_build)
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    # Assert
    assert entry.engine_path == tiny_checkpoint
    assert entry.sm is None
    assert entry.jp is None
    assert entry.trt is None
    assert entry.precision is PrecisionMode.FP16
    assert entry.extras["model_name"] == tiny_checkpoint.stem
    assert not sibling_engine.exists(), (
        "compile_engine must NOT produce a sibling .engine artifact"
    )
    assert elapsed_ms <= 1000.0, (
        f"compile_engine took {elapsed_ms:.1f} ms; AC-2 budget is 100 ms but "
        "stream-hashing a multi-MB checkpoint is allowed to overshoot for "
        "fixtures > 10 MB"
    )
    checkpoint_bytes = tiny_checkpoint.read_bytes()
    expected_sha = hashlib.sha256(checkpoint_bytes).hexdigest()
    assert entry.sha256_hex == expected_sha
# ----------------------------------------------------------------------
# AC-3: deserialize loads, half-casts, GPU-moves, eval-mode.
@_REQUIRE_CUDA
def test_ac3_deserialize_loads_half_casts_gpu_moves_eval(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-3: ``deserialize_engine`` yields an FP16, CUDA-resident, eval-mode model.

    The handle must be a ``PytorchEngineHandle`` whose model is not in
    training mode and whose every parameter is ``float16`` on a CUDA device.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    cache_entry = EngineCacheEntry(
        engine_path=tiny_checkpoint,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "tiny"},
    )

    # Act
    handle = rt.deserialize_engine(cache_entry)

    # Assert
    assert isinstance(handle, PytorchEngineHandle)
    loaded = handle._model
    assert loaded.training is False
    for param in loaded.parameters():
        assert param.dtype is torch.float16
        assert param.device.type == "cuda"
# ----------------------------------------------------------------------
# AC-4: infer numerical comparison vs FP32 reference.
@_REQUIRE_CUDA
def test_ac4_infer_numerical_close_to_fp32(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-4: ``infer`` output matches an FP32 ``_TinyLinear`` within tolerance.

    The same random ``(4, 3)`` float32 batch is run through the runtime and
    through a CPU FP32 reference model; results are compared with
    ``atol=5e-3, rtol=5e-3``.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    fp16_build = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = rt.compile_engine(tiny_checkpoint, fp16_build)
    handle = rt.deserialize_engine(cache_entry)

    batch = np.random.default_rng(seed=0).standard_normal((4, 3), dtype=np.float32)
    reference_model = _TinyLinear().eval()
    with torch.no_grad():
        reference = reference_model(torch.from_numpy(batch)).numpy()

    # Act
    outputs = rt.infer(handle, {"x": batch})

    # Assert
    assert set(outputs.keys()) == {"output"}
    # Upcast so the comparison against the FP32 reference runs in float32.
    fp16_result = outputs["output"].astype(np.float32)
    assert fp16_result.shape == reference.shape
    np.testing.assert_allclose(fp16_result, reference, atol=5e-3, rtol=5e-3)
# ----------------------------------------------------------------------
# AC-5: release frees GPU memory.
@_REQUIRE_CUDA
def test_ac5_release_frees_gpu_memory(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-5: releasing a handle returns allocated GPU memory to the baseline.

    ``torch.cuda.memory_allocated()`` is sampled before loading, after
    loading and after two consecutive ``release_engine`` calls; the last
    reading must not exceed the first, and the handle must be flagged
    released.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    fp16_build = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = rt.compile_engine(tiny_checkpoint, fp16_build)

    # Take the baseline only after emptying the CUDA cache.
    torch.cuda.empty_cache()
    before = torch.cuda.memory_allocated()
    handle = rt.deserialize_engine(cache_entry)
    after_load = torch.cuda.memory_allocated()
    assert after_load > before, "deserialize must allocate GPU memory"

    # Act
    rt.release_engine(handle)
    rt.release_engine(handle)  # I-7 idempotent

    # Assert
    after_release = torch.cuda.memory_allocated()
    assert after_release <= before, (
        f"release_engine left {after_release - before} bytes allocated on GPU"
    )
    assert handle._released is True
# ----------------------------------------------------------------------
# AC-6: missing checkpoint raises EngineDeserializeError.
def test_ac6_missing_checkpoint_raises(
    config: Config, tmp_path: Path, clean_registry: None
) -> None:
    """AC-6: a non-existent checkpoint path raises ``EngineDeserializeError``.

    The error message must mention the missing file's name.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    absent_path = tmp_path / "does_not_exist.pt"
    cache_entry = EngineCacheEntry(
        engine_path=absent_path,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "tiny"},
    )

    # Act / Assert
    with pytest.raises(EngineDeserializeError, match="does_not_exist"):
        rt.deserialize_engine(cache_entry)
# ----------------------------------------------------------------------
# AC-7: mismatched state dict raises EngineDeserializeError; __cause__ preserved.
def test_ac7_incompatible_state_dict_raises_with_cause(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-7: loading a checkpoint into a mismatched architecture fails loudly.

    The ``_TinyLinear`` checkpoint is deserialized under a name registered
    to ``_DifferentArch``; the resulting ``EngineDeserializeError`` must
    chain a ``RuntimeError`` as its ``__cause__``.
    """
    # Arrange
    register_architecture("incompatible", _DifferentArch)
    rt = PytorchFp16Runtime(config)
    cache_entry = EngineCacheEntry(
        engine_path=tiny_checkpoint,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "incompatible"},
    )

    # Act / Assert
    with pytest.raises(EngineDeserializeError) as exc_info:
        rt.deserialize_engine(cache_entry)

    # Inspect the chained exception only after the ``with`` block has exited.
    cause = exc_info.value.__cause__
    assert cause is not None
    assert isinstance(cause, RuntimeError)
# ----------------------------------------------------------------------
# AC-8: CUDA OOM during infer surfaces as the C7-local OutOfMemoryError.
@_REQUIRE_CUDA
def test_ac8_cuda_oom_during_infer_rewrapped(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-8: a CUDA OOM inside the model surfaces as the C7 ``OutOfMemoryError``.

    The handle's model is swapped for a stub whose forward pass raises
    ``torch.cuda.OutOfMemoryError``; ``infer`` must raise the
    ``OutOfMemoryError`` imported from ``c7_inference``.
    """

    class _OomModel(nn.Module):
        """Stub model whose forward pass always raises a CUDA OOM."""

        def forward(self, *_args, **_kwargs):
            raise torch.cuda.OutOfMemoryError("synthetic OOM")

    # Arrange
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    fp16_build = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = rt.compile_engine(tiny_checkpoint, fp16_build)
    handle = rt.deserialize_engine(cache_entry)
    # Replace the loaded model with the always-failing stub.
    handle._model = _OomModel().half().cuda().eval()
    inputs = {"x": np.zeros((1, 3), dtype=np.float32)}

    # Act / Assert
    with pytest.raises(OutOfMemoryError):
        rt.infer(handle, inputs)
# ----------------------------------------------------------------------
# Surface coverage that does not require CUDA.
def test_thermal_state_default_safe_when_no_publisher(config: Config) -> None:
    """Invariant I-6 — with no publisher, telemetry is unavailable and throttle is off."""
    rt = PytorchFp16Runtime(config)

    reported = rt.thermal_state()

    assert isinstance(reported, ThermalState)
    assert reported.is_telemetry_available is False
    assert reported.thermal_throttle_active is False
def test_thermal_state_delegates_to_publisher(config: Config) -> None:
    """``thermal_state`` returns the very object the publisher's ``read`` gives."""
    sentinel = ThermalState(
        cpu_temp_c=42.0,
        gpu_temp_c=45.0,
        thermal_throttle_active=True,
        measured_clock_mhz=918,
        measured_at_ns=1_000_000,
        is_telemetry_available=True,
    )

    class _StubPublisher:
        """Publisher stand-in that always hands back ``sentinel``."""

        def read(self) -> ThermalState:
            return sentinel

    publisher = _StubPublisher()
    rt = PytorchFp16Runtime(config, thermal_publisher=publisher)

    # Identity, not equality: the runtime must pass the object through.
    assert rt.thermal_state() is sentinel
def test_deserialize_missing_architecture_registration(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """Deserializing under an unregistered ``model_name`` raises ``EngineDeserializeError``.

    Nothing is registered in the cleared registry, and the entry names the
    model ``"unknown"``.
    """
    rt = PytorchFp16Runtime(config)
    cache_entry = EngineCacheEntry(
        engine_path=tiny_checkpoint,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "unknown"},
    )

    with pytest.raises(EngineDeserializeError, match="No architecture registered"):
        rt.deserialize_engine(cache_entry)
def test_infer_rejects_foreign_handle(config: Config) -> None:
    """``infer`` raises ``InferenceError`` for a plain ``EngineHandle``."""
    from gps_denied_onboard._types.inference import EngineHandle

    rt = PytorchFp16Runtime(config)
    not_ours = EngineHandle()
    inputs = {"x": np.zeros((1, 3), dtype=np.float32)}

    with pytest.raises(InferenceError, match="foreign handle"):
        rt.infer(not_ours, inputs)
def test_infer_rejects_released_handle(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """``infer`` raises ``InferenceError`` once the handle has been released."""
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    cpu_model = _TinyLinear().eval()
    handle = PytorchEngineHandle(cpu_model, "tiny")
    rt.release_engine(handle)
    inputs = {"x": np.zeros((1, 3), dtype=np.float32)}

    with pytest.raises(InferenceError, match="released"):
        rt.infer(handle, inputs)
def test_release_engine_idempotent_no_handle_corruption(config: Config) -> None:
    """Releasing the same handle twice leaves it flagged as released."""
    rt = PytorchFp16Runtime(config)
    cpu_model = _TinyLinear().eval()
    handle = PytorchEngineHandle(cpu_model, "tiny")

    # Two back-to-back releases of the same handle.
    for _ in range(2):
        rt.release_engine(handle)

    assert handle._released is True
def test_register_architecture_rejects_collision(clean_registry: None) -> None:
    """Re-registering a name with a different factory raises ``ValueError``."""
    taken_name = "clash"
    register_architecture(taken_name, _TinyLinear)

    with pytest.raises(ValueError, match="already registered"):
        register_architecture(taken_name, _DifferentArch)
def test_register_architecture_same_factory_is_idempotent(
    clean_registry: None,
) -> None:
    """Registering the same name/factory pair twice must not raise."""
    for _ in range(2):  # the second pass is the one under test: no raise
        register_architecture("same", _TinyLinear)
# ----------------------------------------------------------------------
# NFRs.
@_REQUIRE_CUDA
@pytest.mark.tier2
def test_nfr_perf_deserialize_under_10s_p95(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """NFR-perf-deserialize — fixture-sized model; real budget validated on Jetson.

    Times five ``deserialize_engine`` calls (releasing the handle after each
    one, outside the timed window) and checks the p95 sample against 10 s.
    """
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)
    fp16_build = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = rt.compile_engine(tiny_checkpoint, fp16_build)

    durations: list[float] = []
    for _ in range(5):
        started = time.perf_counter()
        handle = rt.deserialize_engine(cache_entry)
        durations.append(time.perf_counter() - started)
        rt.release_engine(handle)

    ordered = sorted(durations)
    # With five samples this index is 4, i.e. the slowest run.
    p95 = ordered[int(0.95 * len(ordered))]
    assert p95 <= 10.0, f"deserialize p95={p95:.3f} s exceeds 10 s NFR"
@_REQUIRE_CUDA
def test_nfr_reliability_eval_mode_unconditional(
    config: Config, tmp_path: Path, clean_registry: None
) -> None:
    """A checkpoint saved from a training-mode model MUST load in eval mode."""
    register_architecture("tiny", _TinyLinear)
    rt = PytorchFp16Runtime(config)

    # Save the state_dict of a model that was explicitly put in training mode.
    training_model = _TinyLinear()
    training_model.train()
    checkpoint_path = tmp_path / "trained.pt"
    torch.save(training_model.state_dict(), checkpoint_path)

    cache_entry = EngineCacheEntry(
        engine_path=checkpoint_path,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "tiny"},
    )

    handle = rt.deserialize_engine(cache_entry)

    assert handle._model.training is False