"""AZ-302 — ``ThermalStatePublisher`` acceptance tests. CPU-only and deterministic: every test drives the publisher through its ``_poll_once()`` seam with a manual clock + a scripted ``ThermalSource``; no real ``jtop`` / ``pynvml`` / background thread is involved (those paths live behind the source-selection fallback and are Tier-2 by construction — see ``read_host_tuple`` analog comments in AZ-301). The publisher's background thread is exercised only through the idempotency tests (AC-6); the throttle-detection-latency contract (AC-2 / AC-3) is verified deterministically on the ``_poll_once`` seam because the wall-clock latency is dominated by the publisher's own period (1 / ``thermal_poll_hz`` s), not by code under test. """ from __future__ import annotations import time from collections import deque from typing import Final import pytest from gps_denied_onboard.components.c7_inference import ( C7InferenceConfig, TelemetryUnavailableError, ThermalReading, ThermalSource, ThermalStatePublisher, ) from gps_denied_onboard.components.c7_inference import thermal_publisher as tp_mod from gps_denied_onboard.config.schema import Config from gps_denied_onboard.fdr_client.fakes import FakeFdrSink _PRODUCER_ID: Final[str] = "c7_inference.thermal" _TRANSITION_KIND: Final[str] = "c7.thermal_transition" class _ManualClock: """Test ``Clock`` that returns whatever ``advance_to`` set last. ``monotonic_ns`` and ``time_ns`` share the same backing value because AZ-302 only ever reads ``monotonic_ns``; ``time_ns`` is unused. 
""" def __init__(self, *, start_ns: int = 1_000_000_000) -> None: self._now_ns: int = start_ns def monotonic_ns(self) -> int: return self._now_ns def time_ns(self) -> int: return self._now_ns def sleep_until_ns(self, target_ns: int) -> None: # pragma: no cover - unused self._now_ns = max(self._now_ns, target_ns) def advance_to(self, ns: int) -> None: if ns < self._now_ns: raise AssertionError("monotonic clock cannot go backwards") self._now_ns = ns def advance_by(self, delta_ns: int) -> None: self.advance_to(self._now_ns + delta_ns) class _ScriptedSource: """``ThermalSource`` whose ``read()`` plays back a scripted queue. Each queue entry is either a :class:`ThermalReading` (returned) or a :class:`TelemetryUnavailableError` instance (raised). When the queue is drained, the last entry is repeated indefinitely so steady-state polls do not need explicit padding. """ def __init__(self, script: list[ThermalReading | TelemetryUnavailableError]) -> None: if not script: raise ValueError("_ScriptedSource needs at least one entry") self._queue: deque[ThermalReading | TelemetryUnavailableError] = deque(script) self._last: ThermalReading | TelemetryUnavailableError = script[-1] self.closed: bool = False def read(self) -> ThermalReading: if self._queue: head = self._queue.popleft() self._last = head else: head = self._last if isinstance(head, TelemetryUnavailableError): raise head return head def close(self) -> None: self.closed = True @pytest.fixture def config() -> Config: return Config.with_blocks(c7_inference=C7InferenceConfig(thermal_poll_hz=1.0)) @pytest.fixture def clock() -> _ManualClock: return _ManualClock() @pytest.fixture def fdr() -> FakeFdrSink: return FakeFdrSink(producer_id=_PRODUCER_ID) @pytest.fixture def fast_config() -> Config: """High-frequency config so the AC-6 background-thread tests finish quickly.""" return Config.with_blocks(c7_inference=C7InferenceConfig(thermal_poll_hz=200.0)) def _make_publisher( *, config: Config, fdr: FakeFdrSink, clock: 
_ManualClock, source: ThermalSource | None, ) -> ThermalStatePublisher: return ThermalStatePublisher(config, fdr, source=source, clock=clock) def _prime_without_thread( publisher: ThermalStatePublisher, source: ThermalSource ) -> None: """Run the source-selection + initial-poll path of ``start()`` without spawning the background thread. The deterministic AC-1..AC-4 / AC-8 tests drive :meth:`_poll_once` directly under a manual clock; the production thread would race those calls and consume scripted source entries non-deterministically (one race observed in the first iteration of this test module). """ publisher._source = source publisher._stop_event.clear() publisher._poll_once(initial=True) def _cool_reading(*, throttle: bool = False) -> ThermalReading: return ThermalReading( cpu_temp_c=45.0, gpu_temp_c=55.0, thermal_throttle_active=throttle, measured_clock_mhz=1300, ) def _hot_reading() -> ThermalReading: return ThermalReading( cpu_temp_c=85.0, gpu_temp_c=92.0, thermal_throttle_active=True, measured_clock_mhz=600, ) # ---------------------------------------------------------------------- # AC-1: read() returns the most recent _atomic_snapshot. 
def test_ac1_read_returns_latest_snapshot(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """``read()`` reflects the initial poll, then the poll one second later."""
    one_second_ns = 1_000_000_000
    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    pub = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(pub, scripted)

    before = pub.read()
    clock.advance_by(one_second_ns)
    pub._poll_once()
    after = pub.read()

    # Snapshot taken by the initial poll: cool reading at the clock's start.
    assert before.thermal_throttle_active is False
    assert before.cpu_temp_c == pytest.approx(45.0)
    assert before.measured_at_ns == 1_000_000_000

    # Snapshot taken by the second poll: hot reading one second later.
    assert after.thermal_throttle_active is True
    assert after.gpu_temp_c == pytest.approx(92.0)
    assert after.is_telemetry_available is True
    assert after.measured_at_ns == 2_000_000_000


def test_ac1_read_is_wait_free_object_identity(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """Back-to-back reads with no poll in between hand out the same object."""
    scripted = _ScriptedSource([_cool_reading()])
    pub = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(pub, scripted)

    assert pub.read() is pub.read()


# ----------------------------------------------------------------------
# AC-2: throttle entry — FDR + WARN log + state.


def test_ac2_throttle_entry_emits_fdr_and_warn(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A cool → hot poll yields one transition record and one ENTRY warning."""
    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    pub = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(pub, scripted)
    # The initial (cool) poll must not have produced any FDR record.
    assert fdr.records == []

    clock.advance_by(1_000_000_000)
    with caplog.at_level("WARNING", logger="c7_inference.thermal"):
        pub._poll_once()

    assert len(fdr.records) == 1
    transition = fdr.records[0]
    assert transition.kind == _TRANSITION_KIND
    assert transition.producer_id == _PRODUCER_ID

    payload = transition.payload
    assert payload["previous_state"] is False
    assert payload["new_state"] is True
    assert payload["gpu_temp_c"] == pytest.approx(92.0)
    assert payload["cpu_temp_c"] == pytest.approx(85.0)
    assert payload["measured_clock_mhz"] == 600
    assert payload["measured_at_ns"] == 2_000_000_000

    entry_warning_count = sum(
        1
        for rec in caplog.records
        if rec.levelname == "WARNING" and rec.getMessage() == "thermal throttle ENTRY"
    )
    assert entry_warning_count == 1


# ----------------------------------------------------------------------
# AC-3: throttle exit — FDR + INFO log.


def test_ac3_throttle_exit_emits_fdr_and_info(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A hot → cool poll yields one transition record logged at INFO, not WARNING."""
    source = _ScriptedSource([_hot_reading(), _cool_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
    _prime_without_thread(publisher, source)

    clock.advance_by(1_000_000_000)
    with caplog.at_level("INFO", logger="c7_inference.thermal"):
        publisher._poll_once()

    assert len(fdr.records) == 1
    record = fdr.records[0]
    assert record.kind == _TRANSITION_KIND
    assert record.payload["previous_state"] is True
    assert record.payload["new_state"] is False

    exit_infos = [
        r
        for r in caplog.records
        if r.levelname == "INFO" and r.getMessage() == "thermal throttle exit"
    ]
    assert len(exit_infos) == 1
    # The exit message must never be escalated to WARNING.
    exit_warnings = [
        r
        for r in caplog.records
        if r.levelname == "WARNING" and r.getMessage() == "thermal throttle exit"
    ]
    assert exit_warnings == []


# ----------------------------------------------------------------------
# AC-4: telemetry unavailable on every poll — default-safe + rate-limited WARN.


def test_ac4_telemetry_unavailable_defaults_safe_and_rate_limits(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Every poll fails: snapshot stays default-safe, WARNs are bounded, no FDR."""
    err = TelemetryUnavailableError("jtop hung")
    source = _ScriptedSource([err, err, err, err, err, err])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)

    with caplog.at_level("WARNING", logger="c7_inference.thermal"):
        _prime_without_thread(publisher, source)
        # Drive five more polls, 500 ms apart, across a 2.5-second simulated
        # window.
        for _ in range(5):
            clock.advance_by(500_000_000)
            publisher._poll_once()

    snapshot = publisher.read()
    assert snapshot.is_telemetry_available is False
    assert snapshot.thermal_throttle_active is False
    assert snapshot.gpu_temp_c is None
    assert snapshot.cpu_temp_c is None

    unavailable_warnings = [
        r
        for r in caplog.records
        if r.levelname == "WARNING" and r.getMessage() == "thermal telemetry unavailable"
    ]
    # Initial poll (start) + 5 polls x 500 ms each = 2.5 s window; rate-limit
    # at 1 / sec ⇒ at most 3 WARNs (one immediately + one each second).
    assert 1 <= len(unavailable_warnings) <= 3
    assert fdr.records == []


# ----------------------------------------------------------------------
# AC-5: cold-start with no source raises.


def test_ac5_start_raises_when_no_source_available(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With both backend factories raising, ``start()`` raises and nothing runs."""

    def _raise(*_args: object, **_kw: object) -> None:
        raise TelemetryUnavailableError("backend missing")

    monkeypatch.setattr(tp_mod, "_JtopSource", _raise)
    monkeypatch.setattr(tp_mod, "_PynvmlSource", _raise)
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=None)

    with pytest.raises(TelemetryUnavailableError):
        publisher.start()
    assert publisher.is_running() is False


def test_ac5_jtop_falls_back_to_pynvml(
    fast_config: Config,
    fdr: FakeFdrSink,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When ``jtop`` fails, ``start()`` picks the ``pynvml`` source and runs."""
    # A single entry is enough: ``_ScriptedSource`` repeats its last entry
    # once the script is drained, so the polling thread never runs dry.
    pynvml_source = _ScriptedSource([_cool_reading()])

    def _raise_jtop(*_args: object, **_kw: object) -> None:
        raise TelemetryUnavailableError("no jtop")

    def _make_pynvml(*_args: object, **_kw: object) -> _ScriptedSource:
        return pynvml_source

    monkeypatch.setattr(tp_mod, "_JtopSource", _raise_jtop)
    monkeypatch.setattr(tp_mod, "_PynvmlSource", _make_pynvml)
    publisher = ThermalStatePublisher(fast_config, fdr)

    publisher.start()
    try:
        assert publisher.is_running() is True
        snapshot = publisher.read()
    finally:
        # Always stop, so a failed assertion cannot leave the poll thread
        # running into later tests.
        publisher.stop()

    assert snapshot.is_telemetry_available is True
    assert pynvml_source.closed is True


# ----------------------------------------------------------------------
# AC-6: start/stop is idempotent.


def test_ac6_double_start_is_noop(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """A second ``start()`` keeps the publisher running on the same thread object."""
    source = _ScriptedSource([_cool_reading()])
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)

    publisher.start()
    try:
        first_thread = publisher._thread
        publisher.start()
        second_thread = publisher._thread

        assert publisher.is_running() is True
        assert first_thread is second_thread
    finally:
        # Stop even when an assertion above fails, so the thread is not leaked.
        publisher.stop()


def test_ac6_double_stop_is_noop(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """Calling ``stop()`` twice is harmless and leaves the publisher stopped."""
    source = _ScriptedSource([_cool_reading()])
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)

    publisher.start()
    publisher.stop()
    publisher.stop()

    assert publisher.is_running() is False


def test_background_thread_polls(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """End-to-end: real thread, 200 Hz poll rate, source flips on second read."""
    readings: deque[ThermalReading] = deque(
        [_cool_reading()] + [_hot_reading()] * 64
    )

    class _Live:
        """Source that drains ``readings`` and then keeps returning hot readings."""

        closed = False

        def read(self) -> ThermalReading:
            if readings:
                return readings.popleft()
            return _hot_reading()

        def close(self) -> None:
            # Update the declared ``closed`` flag (the original wrote to an
            # unrelated ``_closed_flag`` attribute, so ``closed`` never changed).
            self.closed = True

    source = _Live()
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)

    publisher.start()
    try:
        # Wait (bounded to one wall-clock second) for the thread to observe
        # the throttle flag.
        deadline = time.monotonic() + 1.0
        while (
            publisher.read().thermal_throttle_active is False
            and time.monotonic() < deadline
        ):
            time.sleep(0.005)
    finally:
        publisher.stop()

    assert publisher.read().thermal_throttle_active is True
    assert any(r.kind == _TRANSITION_KIND for r in fdr.records)


# ----------------------------------------------------------------------
# AC-8: FDR record envelope matches the schema (round-trip via
# serialise/parse).


def test_ac8_fdr_record_round_trips_through_wire_format(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """The transition record survives ``serialise`` → ``parse`` unchanged."""
    from gps_denied_onboard.fdr_client.records import KNOWN_PAYLOAD_KEYS, parse, serialise

    source = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
    _prime_without_thread(publisher, source)
    clock.advance_by(1_000_000_000)
    publisher._poll_once()

    # Fail with a readable assertion rather than an IndexError if the
    # transition record was never emitted.
    assert len(fdr.records) == 1
    record = fdr.records[0]
    encoded = serialise(record)
    decoded = parse(encoded)

    assert decoded.kind == _TRANSITION_KIND
    assert decoded.producer_id == _PRODUCER_ID
    expected_keys = KNOWN_PAYLOAD_KEYS[_TRANSITION_KIND]
    assert set(decoded.payload.keys()) - {"extra"} <= expected_keys
    assert decoded.payload == record.payload


# ----------------------------------------------------------------------
# NFR-reliability-default-safe: first poll fails → snapshot is default-safe.


def test_nfr_default_safe_on_first_poll_failure(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """If the very first poll raises, ``read()`` returns the default-safe snapshot."""
    err = TelemetryUnavailableError("source dead on arrival")
    source = _ScriptedSource([err])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
    _prime_without_thread(publisher, source)

    snapshot = publisher.read()

    assert snapshot.is_telemetry_available is False
    assert snapshot.thermal_throttle_active is False
    assert snapshot.gpu_temp_c is None
    assert snapshot.cpu_temp_c is None


# ----------------------------------------------------------------------
# Structural: ThermalSource is a runtime-checkable Protocol; the scripted
# source satisfies it.


def test_scripted_source_satisfies_protocol() -> None:
    """The scripted test double passes the ``ThermalSource`` isinstance check."""
    assert isinstance(_ScriptedSource([_cool_reading()]), ThermalSource)


def test_default_safe_snapshot_invariant_i6() -> None:
    """``is_telemetry_available == False`` ⇒ ``thermal_throttle_active == False``."""
    stamp_ns = 42
    default_safe = tp_mod._default_safe_snapshot(measured_at_ns=stamp_ns)

    assert default_safe.is_telemetry_available is False
    assert default_safe.thermal_throttle_active is False
    assert default_safe.measured_at_ns == 42