"""AZ-300 — ``PytorchFp16Runtime`` acceptance tests.

CPU-runnable: AC-1, AC-2, AC-6, AC-7 (failure paths trip before any
``.cuda()`` call).

CUDA-only: AC-3, AC-4, AC-5, AC-8, NFR-perf-deserialize,
NFR-reliability-eval-mode. Marked ``@pytest.mark.skipif(...)`` so the
suite stays green on macOS / CI-without-GPU; the Tier-2 Jetson CI runs
the full sweep.
"""

from __future__ import annotations

import hashlib
import time
from collections.abc import Iterator
from pathlib import Path

import numpy as np
import pytest
import torch
from torch import nn

from gps_denied_onboard._types.inference import (
    BuildConfig,
    EngineCacheEntry,
    PrecisionMode,
)
from gps_denied_onboard._types.thermal import ThermalState
from gps_denied_onboard.components.c7_inference import (
    C7InferenceConfig,
    EngineDeserializeError,
    InferenceError,
    InferenceRuntime,
    OutOfMemoryError,
    register_architecture,
)
from gps_denied_onboard.components.c7_inference.architecture_registry import (
    _DEFAULT_REGISTRY,
)
from gps_denied_onboard.components.c7_inference.pytorch_fp16_runtime import (
    PytorchEngineHandle,
    PytorchFp16Runtime,
)
from gps_denied_onboard.config.schema import Config

# Skip marker for the CUDA-only tests: used as a decorator, it skips the test
# when ``torch.cuda.is_available()`` was false at the time this module loaded.
_REQUIRE_CUDA = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="CUDA not available on this host (Tier-1 CI / macOS dev)",
)


class _TinyLinear(nn.Module):
    """Reproducible 3-in / 2-out linear layer whose ``forward`` takes ``x``.

    Every instance ends up with the same weight and bias because both are
    drawn from a private generator seeded with 42, so two independently
    constructed instances are numerically interchangeable.
    """

    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(3, 2)
        seeded = torch.Generator().manual_seed(42)
        # Draw order is part of the contract: weight first, then bias.
        weight = torch.randn(2, 3, generator=seeded, dtype=torch.float32)
        bias = torch.randn(2, generator=seeded, dtype=torch.float32)
        # Overwrite the default initialisation in place, outside autograd.
        with torch.no_grad():
            self.fc.weight.copy_(weight)
            self.fc.bias.copy_(bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = self.fc(x)
        return projected


class _DifferentArch(nn.Module):
    """4-in / 2-out linear layer used as a deliberately mismatched architecture.

    Its ``fc.weight`` has a different shape from the one in
    :class:`_TinyLinear`, so the two state_dicts are not interchangeable.
    """

    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=4, out_features=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = self.fc(x)
        return projected


@pytest.fixture
def clean_registry() -> Iterator[None]:
    """Give the test an empty architecture registry, then put the old one back.

    The current contents of ``_DEFAULT_REGISTRY`` are copied aside and the
    registry is emptied before the test body runs. Afterwards anything the
    test registered is dropped and the saved entries are restored.
    """
    snapshot = dict(_DEFAULT_REGISTRY)
    _DEFAULT_REGISTRY.clear()

    yield

    # Teardown: discard test-local registrations, then reinstate the snapshot.
    _DEFAULT_REGISTRY.clear()
    _DEFAULT_REGISTRY.update(snapshot)


@pytest.fixture
def config() -> Config:
    """Build a ``Config`` whose C7 block selects the ``pytorch_fp16`` runtime."""
    c7_block = C7InferenceConfig(runtime="pytorch_fp16")
    return Config.with_blocks(c7_inference=c7_block)


@pytest.fixture
def tiny_checkpoint(tmp_path: Path) -> Path:
    """Save a fresh ``_TinyLinear`` state_dict as ``tiny.pt`` and return its path.

    The file lives in pytest's per-test ``tmp_path`` directory.
    """
    checkpoint_path = tmp_path / "tiny.pt"
    weights = _TinyLinear().state_dict()
    torch.save(weights, checkpoint_path)
    return checkpoint_path


# ----------------------------------------------------------------------
# AC-1: Protocol conformance + label.


def test_ac1_protocol_conformance(config: Config) -> None:
    """AC-1: the runtime satisfies ``InferenceRuntime`` and reports its label."""
    # Arrange
    fp16_runtime = PytorchFp16Runtime(config)
    # Assert
    assert isinstance(fp16_runtime, InferenceRuntime)
    reported_label = fp16_runtime.current_runtime_label()
    assert reported_label == "pytorch_fp16"


# ----------------------------------------------------------------------
# AC-2: compile_engine no-op.


def test_ac2_compile_engine_is_noop(
    config: Config, tiny_checkpoint: Path
) -> None:
    """AC-2: ``compile_engine`` describes the checkpoint without building anything.

    Checks that the returned entry points back at the input checkpoint, leaves
    ``sm`` / ``jp`` / ``trt`` unset, reports FP16 precision, carries the
    checkpoint stem as ``extras["model_name"]`` and the SHA-256 of the file
    bytes, and that no sibling ``.engine`` file appears next to the checkpoint.
    The call is also timed against a relaxed wall-clock ceiling.
    """
    # Arrange
    # AC-2 nominally budgets 100 ms, but the assertion enforces a 10x looser
    # ceiling. Both numbers are named so the failure message reports the limit
    # that was actually exceeded rather than only the nominal one.
    nominal_budget_ms = 100.0
    enforced_ceiling_ms = 1000.0
    runtime = PytorchFp16Runtime(config)
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    sibling_engine = tiny_checkpoint.parent / f"{tiny_checkpoint.stem}.engine"
    t0 = time.perf_counter()
    # Act
    entry = runtime.compile_engine(tiny_checkpoint, build_config)
    elapsed_ms = (time.perf_counter() - t0) * 1000.0
    # Assert
    assert entry.engine_path == tiny_checkpoint
    assert entry.sm is None
    assert entry.jp is None
    assert entry.trt is None
    assert entry.precision is PrecisionMode.FP16
    assert entry.extras["model_name"] == tiny_checkpoint.stem
    assert not sibling_engine.exists(), (
        "compile_engine must NOT produce a sibling .engine artifact"
    )
    assert elapsed_ms <= enforced_ceiling_ms, (
        f"compile_engine took {elapsed_ms:.1f} ms; enforced ceiling is "
        f"{enforced_ceiling_ms:.0f} ms (AC-2 nominal budget is "
        f"{nominal_budget_ms:.0f} ms, relaxed because stream-hashing a "
        "multi-MB checkpoint is allowed to overshoot for fixtures > 10 MB)"
    )
    # The recorded digest must match an independent hash of the file bytes.
    expected_sha = hashlib.sha256(tiny_checkpoint.read_bytes()).hexdigest()
    assert entry.sha256_hex == expected_sha


# ----------------------------------------------------------------------
# AC-3: deserialize loads, half-casts, GPU-moves, eval-mode.


@_REQUIRE_CUDA
def test_ac3_deserialize_loads_half_casts_gpu_moves_eval(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-3: a deserialized handle wraps an eval-mode FP16 model on the GPU."""
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    cache_entry = EngineCacheEntry(
        engine_path=tiny_checkpoint,
        sha256_hex="0" * 64,  # placeholder digest, not the real file hash
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "tiny"},
    )
    # Act
    engine = fp16_runtime.deserialize_engine(cache_entry)
    # Assert
    assert isinstance(engine, PytorchEngineHandle)
    loaded = engine._model
    assert loaded.training is False
    # Every parameter must have been cast to half precision and moved to CUDA.
    for param in loaded.parameters():
        assert param.dtype is torch.float16
        assert param.device.type == "cuda"


# ----------------------------------------------------------------------
# AC-4: infer numerical comparison vs FP32 reference.


@_REQUIRE_CUDA
def test_ac4_infer_numerical_close_to_fp32(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-4: FP16 inference matches an FP32 CPU reference within 5e-3.

    The reference is a freshly built ``_TinyLinear``, which carries the same
    seeded weights as the checkpoint written by the ``tiny_checkpoint`` fixture.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = fp16_runtime.compile_engine(tiny_checkpoint, build_config)
    engine = fp16_runtime.deserialize_engine(cache_entry)
    batch = np.random.default_rng(seed=0).standard_normal(
        (4, 3), dtype=np.float32
    )
    reference_model = _TinyLinear().eval()
    with torch.no_grad():
        expected = reference_model(torch.from_numpy(batch)).numpy()
    # Act
    result = fp16_runtime.infer(engine, {"x": batch})
    # Assert
    assert set(result.keys()) == {"output"}
    actual = result["output"].astype(np.float32)
    assert actual.shape == expected.shape
    np.testing.assert_allclose(actual, expected, atol=5e-3, rtol=5e-3)


# ----------------------------------------------------------------------
# AC-5: release frees GPU memory.


@_REQUIRE_CUDA
def test_ac5_release_frees_gpu_memory(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-5: releasing a handle returns allocated GPU memory to the baseline.

    ``release_engine`` is called twice to cover the idempotence invariant
    (I-7) along the way.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = fp16_runtime.compile_engine(tiny_checkpoint, build_config)
    # Take the baseline only after dropping cached blocks.
    torch.cuda.empty_cache()
    before = torch.cuda.memory_allocated()
    engine = fp16_runtime.deserialize_engine(cache_entry)
    loaded_bytes = torch.cuda.memory_allocated()
    assert loaded_bytes > before, "deserialize must allocate GPU memory"
    # Act
    for _ in range(2):  # second call exercises I-7 idempotence
        fp16_runtime.release_engine(engine)
    # Assert
    after_release = torch.cuda.memory_allocated()
    assert after_release <= before, (
        f"release_engine left {after_release - before} bytes allocated on GPU"
    )
    assert engine._released is True


# ----------------------------------------------------------------------
# AC-6: missing checkpoint raises EngineDeserializeError.


def test_ac6_missing_checkpoint_raises(
    config: Config, tmp_path: Path, clean_registry: None
) -> None:
    """AC-6: a missing checkpoint path surfaces as ``EngineDeserializeError``.

    The error message must mention the missing file's name.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    absent_path = tmp_path / "does_not_exist.pt"
    cache_entry = EngineCacheEntry(
        engine_path=absent_path,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "tiny"},
    )
    # Act / Assert
    with pytest.raises(EngineDeserializeError, match="does_not_exist"):
        fp16_runtime.deserialize_engine(cache_entry)


# ----------------------------------------------------------------------
# AC-7: mismatched state dict raises EngineDeserializeError; __cause__ preserved.


def test_ac7_incompatible_state_dict_raises_with_cause(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-7: a state_dict/architecture mismatch is wrapped with its cause kept.

    The ``_TinyLinear`` checkpoint is loaded into the registered
    ``_DifferentArch``; the resulting ``EngineDeserializeError`` must chain a
    ``RuntimeError`` as ``__cause__``.
    """
    # Arrange
    register_architecture("incompatible", _DifferentArch)
    fp16_runtime = PytorchFp16Runtime(config)
    cache_entry = EngineCacheEntry(
        engine_path=tiny_checkpoint,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "incompatible"},
    )
    # Act / Assert
    with pytest.raises(EngineDeserializeError) as raised:
        fp16_runtime.deserialize_engine(cache_entry)
    underlying = raised.value.__cause__
    assert underlying is not None
    assert isinstance(underlying, RuntimeError)


# ----------------------------------------------------------------------
# AC-8: CUDA OOM during infer surfaces as the C7-local OutOfMemoryError.


@_REQUIRE_CUDA
def test_ac8_cuda_oom_during_infer_rewrapped(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """AC-8: a CUDA OOM raised by the model surfaces as ``OutOfMemoryError``.

    The handle's model is swapped for a stub whose forward pass always raises
    ``torch.cuda.OutOfMemoryError``.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = fp16_runtime.compile_engine(tiny_checkpoint, build_config)
    engine = fp16_runtime.deserialize_engine(cache_entry)

    class _OomModel(nn.Module):
        # Stand-in network: every forward call fails with a synthetic CUDA OOM.
        def forward(self, *_args, **_kwargs):
            raise torch.cuda.OutOfMemoryError("synthetic OOM")

    # Replace the real network on the handle with the always-failing stub.
    engine._model = _OomModel().half().cuda().eval()
    dummy_inputs = {"x": np.zeros((1, 3), dtype=np.float32)}
    # Act / Assert
    with pytest.raises(OutOfMemoryError):
        fp16_runtime.infer(engine, dummy_inputs)


# ----------------------------------------------------------------------
# Surface coverage that does not require CUDA.


def test_thermal_state_default_safe_when_no_publisher(config: Config) -> None:
    """Invariant I-6 — with no publisher, telemetry is unavailable and throttle is off."""
    # Arrange
    fp16_runtime = PytorchFp16Runtime(config)
    # Act
    reported = fp16_runtime.thermal_state()
    # Assert
    assert isinstance(reported, ThermalState)
    assert reported.is_telemetry_available is False
    assert reported.thermal_throttle_active is False


def test_thermal_state_delegates_to_publisher(config: Config) -> None:
    """``thermal_state`` returns exactly the object the publisher's ``read`` gives."""
    # Arrange
    expected_state = ThermalState(
        cpu_temp_c=42.0,
        gpu_temp_c=45.0,
        thermal_throttle_active=True,
        measured_clock_mhz=918,
        measured_at_ns=1_000_000,
        is_telemetry_available=True,
    )

    class _StubPublisher:
        # Minimal publisher: ``read`` always hands back the prepared state.
        def read(self) -> ThermalState:
            return expected_state

    publisher = _StubPublisher()
    fp16_runtime = PytorchFp16Runtime(config, thermal_publisher=publisher)
    # Act / Assert — identity, not equality: the runtime must not copy the state.
    assert fp16_runtime.thermal_state() is expected_state


def test_deserialize_missing_architecture_registration(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """An unregistered ``model_name`` is rejected with ``EngineDeserializeError``."""
    # Arrange — the registry is empty, so "unknown" has no factory.
    fp16_runtime = PytorchFp16Runtime(config)
    cache_entry = EngineCacheEntry(
        engine_path=tiny_checkpoint,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "unknown"},
    )
    # Act / Assert
    with pytest.raises(EngineDeserializeError, match="No architecture registered"):
        fp16_runtime.deserialize_engine(cache_entry)


def test_infer_rejects_foreign_handle(config: Config) -> None:
    """``infer`` refuses a plain ``EngineHandle`` that this runtime did not create."""
    from gps_denied_onboard._types.inference import EngineHandle

    # Arrange
    fp16_runtime = PytorchFp16Runtime(config)
    outsider = EngineHandle()
    dummy_inputs = {"x": np.zeros((1, 3), dtype=np.float32)}
    # Act / Assert
    with pytest.raises(InferenceError, match="foreign handle"):
        fp16_runtime.infer(outsider, dummy_inputs)


def test_infer_rejects_released_handle(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """``infer`` on an already released handle raises ``InferenceError``."""
    # Arrange — the handle is built directly around a CPU model, then released.
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    engine = PytorchEngineHandle(_TinyLinear().eval(), "tiny")
    fp16_runtime.release_engine(engine)
    dummy_inputs = {"x": np.zeros((1, 3), dtype=np.float32)}
    # Act / Assert
    with pytest.raises(InferenceError, match="released"):
        fp16_runtime.infer(engine, dummy_inputs)


def test_release_engine_idempotent_no_handle_corruption(config: Config) -> None:
    """Releasing the same handle twice is harmless and leaves it marked released."""
    # Arrange
    fp16_runtime = PytorchFp16Runtime(config)
    engine = PytorchEngineHandle(_TinyLinear().eval(), "tiny")
    # Act — the second pass must be a no-op rather than an error.
    for _ in range(2):
        fp16_runtime.release_engine(engine)
    # Assert
    assert engine._released is True


def test_register_architecture_rejects_collision(clean_registry: None) -> None:
    """Re-registering a name with a different factory raises ``ValueError``."""
    # Arrange
    taken_name = "clash"
    register_architecture(taken_name, _TinyLinear)
    # Act / Assert
    with pytest.raises(ValueError, match="already registered"):
        register_architecture(taken_name, _DifferentArch)


def test_register_architecture_same_factory_is_idempotent(
    clean_registry: None,
) -> None:
    """Registering the same name/factory pair twice must not raise."""
    for _ in range(2):  # the repeat is the behaviour under test
        register_architecture("same", _TinyLinear)


# ----------------------------------------------------------------------
# NFRs.


@_REQUIRE_CUDA
@pytest.mark.tier2
def test_nfr_perf_deserialize_under_10s_p95(
    config: Config, tiny_checkpoint: Path, clean_registry: None
) -> None:
    """NFR-perf-deserialize — fixture-sized model; real budget validated on Jetson.

    Times five ``deserialize_engine`` calls, releasing the handle outside the
    timed window each time. With five samples the p95 index resolves to the
    slowest one.
    """
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    build_config = BuildConfig(
        precision=PrecisionMode.FP16,
        workspace_mb=512,
        calibration_dataset=None,
        optimization_profiles=(),
    )
    cache_entry = fp16_runtime.compile_engine(tiny_checkpoint, build_config)
    # Act
    durations_s: list[float] = []
    for _ in range(5):
        started = time.perf_counter()
        engine = fp16_runtime.deserialize_engine(cache_entry)
        durations_s.append(time.perf_counter() - started)
        fp16_runtime.release_engine(engine)
    # Assert
    ordered = sorted(durations_s)
    p95 = ordered[int(0.95 * len(ordered))]
    assert p95 <= 10.0, f"deserialize p95={p95:.3f} s exceeds 10 s NFR"


@_REQUIRE_CUDA
def test_nfr_reliability_eval_mode_unconditional(
    config: Config, tmp_path: Path, clean_registry: None
) -> None:
    """A checkpoint saved from a training-mode model MUST load in eval mode."""
    # Arrange
    register_architecture("tiny", _TinyLinear)
    fp16_runtime = PytorchFp16Runtime(config)
    # Put the source model in training mode before its state_dict is saved.
    source_model = _TinyLinear().train()
    checkpoint_path = tmp_path / "trained.pt"
    torch.save(source_model.state_dict(), checkpoint_path)
    cache_entry = EngineCacheEntry(
        engine_path=checkpoint_path,
        sha256_hex="0" * 64,
        sm=None,
        jp=None,
        trt=None,
        precision=PrecisionMode.FP16,
        extras={"model_name": "tiny"},
    )
    # Act
    engine = fp16_runtime.deserialize_engine(cache_entry)
    # Assert
    assert engine._model.training is False