Files
gps-denied-onboard/tests/unit/c7_inference/test_thermal_publisher.py
T
Oleksandr Bezdieniezhnykh 49a67f770d [AZ-302] C7 ThermalStatePublisher — jtop/NVML 1 Hz background poller
Implements AZ-297 InferenceRuntime's thermal_state() side: a singleton
background-thread publisher that polls jtop (preferred) or pynvml
(fallback) at config.thermal_poll_hz, stores an atomic ThermalState
snapshot, and emits c7.thermal_transition FDR records on every throttle
flip with a WARN log on entry and an INFO log on exit. Default-safe on
TelemetryUnavailableError per Invariant I-6 with a 1-Hz rate-limited
WARN.

Sources return a raw ThermalReading; the publisher stamps measured_at_ns
via its injected Clock so _JtopSource / _PynvmlSource stay clean of
direct time.* calls (Invariant 2). _poll_once is the deterministic test
seam — start() spawns the production thread.

- c7.thermal_transition registered in fdr_client.records KNOWN_PAYLOAD_KEYS
- [telemetry] optional dep group (jetson-stats, pynvml) added to pyproject
- 14 unit tests (AC-1..AC-6, AC-8, NFR-default-safe, structural)
  green; AC-7 / AC-1 microbench / NFR-perf-poll Tier-2 deferred
- full unit suite: 1140 passed, 11 expected Tier-2/CUDA skips

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 10:33:37 +03:00

507 lines
16 KiB
Python

"""AZ-302 — ``ThermalStatePublisher`` acceptance tests.
CPU-only and deterministic: every test drives the publisher through its
``_poll_once()`` seam with a manual clock + a scripted ``ThermalSource``;
no real ``jtop`` / ``pynvml`` / background thread is involved (those
paths live behind the source-selection fallback and are Tier-2 by
construction — see ``read_host_tuple`` analog comments in AZ-301).
The publisher's background thread is exercised only through the
idempotency tests (AC-6); the throttle-detection-latency contract
(AC-2 / AC-3) is verified deterministically on the ``_poll_once`` seam
because the wall-clock latency is dominated by the publisher's own
period (1 / ``thermal_poll_hz`` s), not by code under test.
"""
from __future__ import annotations
import time
from collections import deque
from typing import Final
import pytest
from gps_denied_onboard.components.c7_inference import (
C7InferenceConfig,
TelemetryUnavailableError,
ThermalReading,
ThermalSource,
ThermalStatePublisher,
)
from gps_denied_onboard.components.c7_inference import thermal_publisher as tp_mod
from gps_denied_onboard.config.schema import Config
from gps_denied_onboard.fdr_client.fakes import FakeFdrSink
# Producer id handed to the fake FDR sink and expected on emitted records.
_PRODUCER_ID: Final[str] = "c7_inference.thermal"
# Record kind the tests expect for throttle-transition FDR records.
_TRANSITION_KIND: Final[str] = "c7.thermal_transition"
class _ManualClock:
    """Hand-driven ``Clock`` double for deterministic tests.

    A single integer backs both ``monotonic_ns`` and ``time_ns``; it only
    changes when the test calls ``advance_to`` / ``advance_by`` (or
    ``sleep_until_ns``), and it can never move backwards.
    """

    def __init__(self, *, start_ns: int = 1_000_000_000) -> None:
        self._now_ns: int = start_ns

    def monotonic_ns(self) -> int:
        """Return the current simulated time in nanoseconds."""
        return self._now_ns

    def time_ns(self) -> int:
        """Return the same backing value as :meth:`monotonic_ns`."""
        return self._now_ns

    def sleep_until_ns(self, target_ns: int) -> None:  # pragma: no cover - unused
        """Jump forward to ``target_ns``; a target in the past is a no-op."""
        if target_ns > self._now_ns:
            self._now_ns = target_ns

    def advance_to(self, ns: int) -> None:
        """Set the clock to ``ns``.

        Raises:
            AssertionError: if ``ns`` is earlier than the current value.
        """
        if ns >= self._now_ns:
            self._now_ns = ns
            return
        raise AssertionError("monotonic clock cannot go backwards")

    def advance_by(self, delta_ns: int) -> None:
        """Move the clock by ``delta_ns`` relative to its current value."""
        target_ns = self._now_ns + delta_ns
        self.advance_to(target_ns)
class _ScriptedSource:
    """Scripted ``ThermalSource`` double.

    ``read()`` walks the script front to back. A :class:`ThermalReading`
    entry is returned; a :class:`TelemetryUnavailableError` entry is
    raised. Once the script is exhausted the most recently played entry
    keeps being replayed, so steady-state polls need no padding.
    """

    def __init__(self, script: list[ThermalReading | TelemetryUnavailableError]) -> None:
        if not script:
            raise ValueError("_ScriptedSource needs at least one entry")
        self._queue: deque[ThermalReading | TelemetryUnavailableError] = deque(script)
        # Seeded with the final entry; ``read()`` refreshes it on every pop.
        self._last: ThermalReading | TelemetryUnavailableError = script[-1]
        self.closed: bool = False

    def read(self) -> ThermalReading:
        """Play the next scripted entry (or replay the last one)."""
        try:
            entry = self._queue.popleft()
        except IndexError:
            # Script drained: keep replaying the last entry.
            entry = self._last
        else:
            self._last = entry
        if isinstance(entry, TelemetryUnavailableError):
            raise entry
        return entry

    def close(self) -> None:
        """Record that the source was closed."""
        self.closed = True
@pytest.fixture
def config() -> Config:
    """Config whose C7 inference block sets ``thermal_poll_hz`` to 1.0."""
    c7_block = C7InferenceConfig(thermal_poll_hz=1.0)
    return Config.with_blocks(c7_inference=c7_block)
@pytest.fixture
def clock() -> _ManualClock:
    """Fresh manual clock starting at its default ``start_ns``."""
    manual_clock = _ManualClock()
    return manual_clock
@pytest.fixture
def fdr() -> FakeFdrSink:
    """Fake FDR sink created with the thermal producer id."""
    sink = FakeFdrSink(producer_id=_PRODUCER_ID)
    return sink
@pytest.fixture
def fast_config() -> Config:
    """Config polling at 200 Hz so the background-thread tests finish quickly."""
    c7_block = C7InferenceConfig(thermal_poll_hz=200.0)
    return Config.with_blocks(c7_inference=c7_block)
def _make_publisher(
    *,
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    source: ThermalSource | None,
) -> ThermalStatePublisher:
    """Build a ``ThermalStatePublisher`` from keyword-only test doubles.

    ``config`` and ``fdr`` are forwarded positionally; ``source`` and
    ``clock`` are forwarded by keyword.
    """
    publisher = ThermalStatePublisher(config, fdr, source=source, clock=clock)
    return publisher
def _prime_without_thread(
    publisher: ThermalStatePublisher, source: ThermalSource
) -> None:
    """Prime ``publisher`` the way ``start()`` would, minus the thread.

    Installs ``source``, clears the stop event and performs the initial
    poll. No background thread is spawned: the deterministic
    AC-1..AC-4 / AC-8 tests call :meth:`_poll_once` themselves under a
    manual clock, and a live thread would race those calls and consume
    scripted source entries non-deterministically (one such race was
    observed in the first iteration of this test module).
    """
    publisher._source = source
    stop_event = publisher._stop_event
    stop_event.clear()
    publisher._poll_once(initial=True)
def _cool_reading(*, throttle: bool = False) -> ThermalReading:
    """Return a low-temperature reading; ``throttle`` sets the throttle flag."""
    fields = {
        "cpu_temp_c": 45.0,
        "gpu_temp_c": 55.0,
        "thermal_throttle_active": throttle,
        "measured_clock_mhz": 1300,
    }
    return ThermalReading(**fields)
def _hot_reading() -> ThermalReading:
    """Return a high-temperature reading with the throttle flag set."""
    fields = {
        "cpu_temp_c": 85.0,
        "gpu_temp_c": 92.0,
        "thermal_throttle_active": True,
        "measured_clock_mhz": 600,
    }
    return ThermalReading(**fields)
# ----------------------------------------------------------------------
# AC-1: read() returns the most recent _atomic_snapshot.
def test_ac1_read_returns_latest_snapshot(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """``read()`` reflects the initial poll, then the poll one second later."""
    one_second_ns = 1_000_000_000
    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)

    before = publisher.read()
    clock.advance_by(one_second_ns)
    publisher._poll_once()
    after = publisher.read()

    # The first snapshot is checked only after the second poll has run.
    assert before.thermal_throttle_active is False
    assert before.cpu_temp_c == pytest.approx(45.0)
    assert before.measured_at_ns == one_second_ns

    assert after.thermal_throttle_active is True
    assert after.gpu_temp_c == pytest.approx(92.0)
    assert after.is_telemetry_available is True
    assert after.measured_at_ns == 2 * one_second_ns
def test_ac1_read_is_wait_free_object_identity(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """Back-to-back reads without a poll in between yield the very same object."""
    scripted = _ScriptedSource([_cool_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)

    assert publisher.read() is publisher.read()
# ----------------------------------------------------------------------
# AC-2: throttle entry — FDR + WARN log + state.
def test_ac2_throttle_entry_emits_fdr_and_warn(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A cool -> hot flip yields one transition record and one ENTRY warning."""
    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)
    # The initial (cool) poll must not emit anything.
    assert fdr.records == []

    clock.advance_by(1_000_000_000)
    with caplog.at_level("WARNING", logger="c7_inference.thermal"):
        publisher._poll_once()

    assert len(fdr.records) == 1
    transition = fdr.records[0]
    assert transition.kind == _TRANSITION_KIND
    assert transition.producer_id == _PRODUCER_ID

    payload = transition.payload
    assert payload["previous_state"] is False
    assert payload["new_state"] is True
    assert payload["gpu_temp_c"] == pytest.approx(92.0)
    assert payload["cpu_temp_c"] == pytest.approx(85.0)
    assert payload["measured_clock_mhz"] == 600
    assert payload["measured_at_ns"] == 2_000_000_000

    entry_warning_count = sum(
        1
        for rec in caplog.records
        if rec.levelname == "WARNING" and rec.getMessage() == "thermal throttle ENTRY"
    )
    assert entry_warning_count == 1
# ----------------------------------------------------------------------
# AC-3: throttle exit — FDR + INFO log.
def test_ac3_throttle_exit_emits_fdr_and_info(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A hot -> cool flip yields one transition record logged at INFO, not WARNING."""
    exit_message = "thermal throttle exit"
    scripted = _ScriptedSource([_hot_reading(), _cool_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)

    clock.advance_by(1_000_000_000)
    with caplog.at_level("INFO", logger="c7_inference.thermal"):
        publisher._poll_once()

    assert len(fdr.records) == 1
    transition = fdr.records[0]
    assert transition.kind == _TRANSITION_KIND
    assert transition.payload["previous_state"] is True
    assert transition.payload["new_state"] is False

    def _count_at(level_name: str) -> int:
        # Number of captured exit messages logged at the given level.
        return sum(
            1
            for rec in caplog.records
            if rec.levelname == level_name and rec.getMessage() == exit_message
        )

    assert _count_at("INFO") == 1
    assert _count_at("WARNING") == 0
# ----------------------------------------------------------------------
# AC-4: telemetry unavailable on every poll — default-safe + rate-limited WARN.
def test_ac4_telemetry_unavailable_defaults_safe_and_rate_limits(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Every poll fails: snapshot stays default-safe and WARNs are bounded."""
    half_second_ns = 500_000_000
    follow_up_polls = 5
    failure = TelemetryUnavailableError("jtop hung")
    scripted = _ScriptedSource([failure] * (1 + follow_up_polls))
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)

    with caplog.at_level("WARNING", logger="c7_inference.thermal"):
        _prime_without_thread(publisher, scripted)
        # Five further polls spaced 500 ms apart: 2.5 s of simulated time.
        for _ in range(follow_up_polls):
            clock.advance_by(half_second_ns)
            publisher._poll_once()

    latest = publisher.read()
    assert latest.is_telemetry_available is False
    assert latest.thermal_throttle_active is False
    assert latest.gpu_temp_c is None
    assert latest.cpu_temp_c is None

    warn_count = sum(
        1
        for rec in caplog.records
        if rec.levelname == "WARNING"
        and rec.getMessage() == "thermal telemetry unavailable"
    )
    # Initial poll + 5 polls x 500 ms = 2.5 s window; with a 1-per-second
    # rate limit that is at most 3 WARNs (one immediately, one per second).
    assert 1 <= warn_count <= 3
    assert fdr.records == []
# ----------------------------------------------------------------------
# AC-5: cold-start with no source raises.
def test_ac5_start_raises_when_no_source_available(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With both backends failing, ``start()`` raises and nothing is left running."""

    def _backend_missing(*_args: object, **_kw: object) -> None:
        raise TelemetryUnavailableError("backend missing")

    # Replace both source factories so neither backend can be constructed.
    for factory_name in ("_JtopSource", "_PynvmlSource"):
        monkeypatch.setattr(tp_mod, factory_name, _backend_missing)

    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=None)
    with pytest.raises(TelemetryUnavailableError):
        publisher.start()
    assert publisher.is_running() is False
def test_ac5_jtop_falls_back_to_pynvml(
    fast_config: Config,
    fdr: FakeFdrSink,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When ``jtop`` fails, ``start()`` picks the ``pynvml`` source and runs.

    The ``_JtopSource`` factory is patched to raise and the
    ``_PynvmlSource`` factory to hand back a scripted source. After
    ``stop()`` the scripted source must have been closed.
    """
    pynvml_source = _ScriptedSource([_cool_reading() for _ in range(17)])

    def _raise_jtop(*_args: object, **_kw: object) -> None:
        raise TelemetryUnavailableError("no jtop")

    def _make_pynvml(*_args: object, **_kw: object) -> _ScriptedSource:
        return pynvml_source

    monkeypatch.setattr(tp_mod, "_JtopSource", _raise_jtop)
    monkeypatch.setattr(tp_mod, "_PynvmlSource", _make_pynvml)

    publisher = ThermalStatePublisher(fast_config, fdr)
    publisher.start()
    try:
        assert publisher.is_running() is True
        snapshot = publisher.read()
    finally:
        # Always stop the publisher, even when an assertion above fails,
        # so a failing test does not leave the poller running.
        publisher.stop()

    assert snapshot.is_telemetry_available is True
    assert pynvml_source.closed is True
# ----------------------------------------------------------------------
# AC-6: start/stop is idempotent.
def test_ac6_double_start_is_noop(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """A second ``start()`` keeps the publisher running on the same thread object."""
    source = _ScriptedSource([_cool_reading()])
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)
    publisher.start()
    try:
        first_thread = publisher._thread
        publisher.start()
        second_thread = publisher._thread
        assert publisher.is_running() is True
        assert first_thread is second_thread
    finally:
        # Stop unconditionally so a failed assertion cannot leave the
        # publisher running for the rest of the test session.
        publisher.stop()
def test_ac6_double_stop_is_noop(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """Calling ``stop()`` twice in a row leaves the publisher not running."""
    scripted = _ScriptedSource([_cool_reading()])
    publisher = ThermalStatePublisher(fast_config, fdr, source=scripted)
    publisher.start()
    # The second iteration is the call under test: it must be a no-op.
    for _ in range(2):
        publisher.stop()
    assert publisher.is_running() is False
def test_background_thread_polls(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """End-to-end: real thread, 200 Hz poll rate, source flips on second read.

    The test waits up to one wall-clock second for the published
    snapshot to report throttling, then stops the publisher and checks
    that a transition record reached the FDR sink.
    """
    readings: deque[ThermalReading] = deque(
        [_cool_reading()] + [_hot_reading()] * 64
    )

    class _Live:
        """Minimal live source: one cool reading, then hot readings forever."""

        closed = False

        def read(self) -> ThermalReading:
            if readings:
                return readings.popleft()
            return _hot_reading()

        def close(self) -> None:
            # Flip the declared ``closed`` flag, as ``_ScriptedSource.close`` does.
            self.closed = True

    source = _Live()
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)
    publisher.start()
    try:
        deadline = time.monotonic() + 1.0
        while (
            publisher.read().thermal_throttle_active is False
            and time.monotonic() < deadline
        ):
            time.sleep(0.005)
    finally:
        # Stop the publisher even if the wait loop raises.
        publisher.stop()

    assert publisher.read().thermal_throttle_active is True
    assert any(r.kind == _TRANSITION_KIND for r in fdr.records)
# ----------------------------------------------------------------------
# AC-8: FDR record envelope matches the schema (round-trip via serialise/parse).
def test_ac8_fdr_record_round_trips_through_wire_format(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """A transition record survives ``serialise`` -> ``parse`` unchanged."""
    from gps_denied_onboard.fdr_client.records import KNOWN_PAYLOAD_KEYS, parse, serialise

    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)
    clock.advance_by(1_000_000_000)
    publisher._poll_once()

    original = fdr.records[0]
    round_tripped = parse(serialise(original))

    assert round_tripped.kind == _TRANSITION_KIND
    assert round_tripped.producer_id == _PRODUCER_ID

    # Every payload key other than "extra" must be registered for this kind.
    registered_keys = KNOWN_PAYLOAD_KEYS[_TRANSITION_KIND]
    payload_keys = {key for key in round_tripped.payload if key != "extra"}
    assert payload_keys <= registered_keys
    assert round_tripped.payload == original.payload
# ----------------------------------------------------------------------
# NFR-reliability-default-safe: first poll fails → snapshot is default-safe.
def test_nfr_default_safe_on_first_poll_failure(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """If the very first poll fails, ``read()`` returns the default-safe snapshot."""
    dead_source = _ScriptedSource(
        [TelemetryUnavailableError("source dead on arrival")]
    )
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=dead_source)
    _prime_without_thread(publisher, dead_source)

    state = publisher.read()
    assert state.is_telemetry_available is False
    assert state.thermal_throttle_active is False
    assert state.gpu_temp_c is None
    assert state.cpu_temp_c is None
# ----------------------------------------------------------------------
# Structural: ThermalSource is a runtime-checkable Protocol; the scripted
# source satisfies it.
def test_scripted_source_satisfies_protocol() -> None:
    """The scripted test double passes an ``isinstance`` check against ``ThermalSource``."""
    assert isinstance(_ScriptedSource([_cool_reading()]), ThermalSource)
def test_default_safe_snapshot_invariant_i6() -> None:
    """``is_telemetry_available == False`` ⇒ ``thermal_throttle_active == False``."""
    stamp_ns = 42
    default_safe = tp_mod._default_safe_snapshot(measured_at_ns=stamp_ns)
    assert default_safe.is_telemetry_available is False
    assert default_safe.thermal_throttle_active is False
    assert default_safe.measured_at_ns == stamp_ns