mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-22 23:41:13 +00:00
[AZ-302] C7 ThermalStatePublisher — jtop/NVML 1 Hz background poller
Implements AZ-297 InferenceRuntime's thermal_state() side: a singleton background-thread publisher that polls jtop (preferred) or pynvml (fallback) at config.thermal_poll_hz, stores an atomic ThermalState snapshot, and emits c7.thermal_transition FDR records on every throttle flip with a WARN log on entry and an INFO log on exit. Default-safe on TelemetryUnavailableError per Invariant I-6 with a 1-Hz rate-limited WARN. Sources return a raw ThermalReading; the publisher stamps measured_at_ns via its injected Clock so _JtopSource / _PynvmlSource stay clean of direct time.* calls (Invariant 2). _poll_once is the deterministic test seam — start() spawns the production thread. - c7.thermal_transition registered in fdr_client.records KNOWN_PAYLOAD_KEYS - [telemetry] optional dep group (jetson-stats, pynvml) added to pyproject - 14 unit tests (AC-1..AC-6, AC-8, NFR-default-safe, structural) green; AC-7 / AC-1 microbench / NFR-perf-poll Tier-2 deferred - full unit suite: 1140 passed, 11 expected Tier-2/CUDA skips Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -0,0 +1,506 @@
|
||||
"""AZ-302 — ``ThermalStatePublisher`` acceptance tests.
|
||||
|
||||
CPU-only and deterministic: every test drives the publisher through its
|
||||
``_poll_once()`` seam with a manual clock + a scripted ``ThermalSource``;
|
||||
no real ``jtop`` / ``pynvml`` / background thread is involved (those
|
||||
paths live behind the source-selection fallback and are Tier-2 by
|
||||
construction — see ``read_host_tuple`` analog comments in AZ-301).
|
||||
|
||||
The publisher's background thread is exercised only through the
|
||||
idempotency tests (AC-6); the throttle-detection-latency contract
|
||||
(AC-2 / AC-3) is verified deterministically on the ``_poll_once`` seam
|
||||
because the wall-clock latency is dominated by the publisher's own
|
||||
period (1 / ``thermal_poll_hz`` s), not by code under test.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import Final
|
||||
|
||||
import pytest
|
||||
|
||||
from gps_denied_onboard.components.c7_inference import (
|
||||
C7InferenceConfig,
|
||||
TelemetryUnavailableError,
|
||||
ThermalReading,
|
||||
ThermalSource,
|
||||
ThermalStatePublisher,
|
||||
)
|
||||
from gps_denied_onboard.components.c7_inference import thermal_publisher as tp_mod
|
||||
from gps_denied_onboard.config.schema import Config
|
||||
from gps_denied_onboard.fdr_client.fakes import FakeFdrSink
|
||||
|
||||
_PRODUCER_ID: Final[str] = "c7_inference.thermal"
|
||||
_TRANSITION_KIND: Final[str] = "c7.thermal_transition"
|
||||
|
||||
|
||||
class _ManualClock:
|
||||
"""Test ``Clock`` that returns whatever ``advance_to`` set last.
|
||||
|
||||
``monotonic_ns`` and ``time_ns`` share the same backing value because
|
||||
AZ-302 only ever reads ``monotonic_ns``; ``time_ns`` is unused.
|
||||
"""
|
||||
|
||||
def __init__(self, *, start_ns: int = 1_000_000_000) -> None:
|
||||
self._now_ns: int = start_ns
|
||||
|
||||
def monotonic_ns(self) -> int:
|
||||
return self._now_ns
|
||||
|
||||
def time_ns(self) -> int:
|
||||
return self._now_ns
|
||||
|
||||
def sleep_until_ns(self, target_ns: int) -> None: # pragma: no cover - unused
|
||||
self._now_ns = max(self._now_ns, target_ns)
|
||||
|
||||
def advance_to(self, ns: int) -> None:
|
||||
if ns < self._now_ns:
|
||||
raise AssertionError("monotonic clock cannot go backwards")
|
||||
self._now_ns = ns
|
||||
|
||||
def advance_by(self, delta_ns: int) -> None:
|
||||
self.advance_to(self._now_ns + delta_ns)
|
||||
|
||||
|
||||
class _ScriptedSource:
|
||||
"""``ThermalSource`` whose ``read()`` plays back a scripted queue.
|
||||
|
||||
Each queue entry is either a :class:`ThermalReading` (returned) or a
|
||||
:class:`TelemetryUnavailableError` instance (raised). When the queue
|
||||
is drained, the last entry is repeated indefinitely so steady-state
|
||||
polls do not need explicit padding.
|
||||
"""
|
||||
|
||||
def __init__(self, script: list[ThermalReading | TelemetryUnavailableError]) -> None:
|
||||
if not script:
|
||||
raise ValueError("_ScriptedSource needs at least one entry")
|
||||
self._queue: deque[ThermalReading | TelemetryUnavailableError] = deque(script)
|
||||
self._last: ThermalReading | TelemetryUnavailableError = script[-1]
|
||||
self.closed: bool = False
|
||||
|
||||
def read(self) -> ThermalReading:
|
||||
if self._queue:
|
||||
head = self._queue.popleft()
|
||||
self._last = head
|
||||
else:
|
||||
head = self._last
|
||||
if isinstance(head, TelemetryUnavailableError):
|
||||
raise head
|
||||
return head
|
||||
|
||||
def close(self) -> None:
|
||||
self.closed = True
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def config() -> Config:
|
||||
return Config.with_blocks(c7_inference=C7InferenceConfig(thermal_poll_hz=1.0))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def clock() -> _ManualClock:
|
||||
return _ManualClock()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def fdr() -> FakeFdrSink:
|
||||
return FakeFdrSink(producer_id=_PRODUCER_ID)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def fast_config() -> Config:
|
||||
"""High-frequency config so the AC-6 background-thread tests finish quickly."""
|
||||
|
||||
return Config.with_blocks(c7_inference=C7InferenceConfig(thermal_poll_hz=200.0))
|
||||
|
||||
|
||||
def _make_publisher(
|
||||
*,
|
||||
config: Config,
|
||||
fdr: FakeFdrSink,
|
||||
clock: _ManualClock,
|
||||
source: ThermalSource | None,
|
||||
) -> ThermalStatePublisher:
|
||||
return ThermalStatePublisher(config, fdr, source=source, clock=clock)
|
||||
|
||||
|
||||
def _prime_without_thread(
|
||||
publisher: ThermalStatePublisher, source: ThermalSource
|
||||
) -> None:
|
||||
"""Run the source-selection + initial-poll path of ``start()`` without
|
||||
spawning the background thread.
|
||||
|
||||
The deterministic AC-1..AC-4 / AC-8 tests drive :meth:`_poll_once`
|
||||
directly under a manual clock; the production thread would race
|
||||
those calls and consume scripted source entries non-deterministically
|
||||
(one race observed in the first iteration of this test module).
|
||||
"""
|
||||
|
||||
publisher._source = source
|
||||
publisher._stop_event.clear()
|
||||
publisher._poll_once(initial=True)
|
||||
|
||||
|
||||
def _cool_reading(*, throttle: bool = False) -> ThermalReading:
|
||||
return ThermalReading(
|
||||
cpu_temp_c=45.0,
|
||||
gpu_temp_c=55.0,
|
||||
thermal_throttle_active=throttle,
|
||||
measured_clock_mhz=1300,
|
||||
)
|
||||
|
||||
|
||||
def _hot_reading() -> ThermalReading:
|
||||
return ThermalReading(
|
||||
cpu_temp_c=85.0,
|
||||
gpu_temp_c=92.0,
|
||||
thermal_throttle_active=True,
|
||||
measured_clock_mhz=600,
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-1: read() returns the most recent _atomic_snapshot.
|
||||
|
||||
|
||||
def test_ac1_read_returns_latest_snapshot(
|
||||
config: Config, fdr: FakeFdrSink, clock: _ManualClock
|
||||
) -> None:
|
||||
source = _ScriptedSource([_cool_reading(), _hot_reading()])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
_prime_without_thread(publisher, source)
|
||||
|
||||
snapshot_after_initial = publisher.read()
|
||||
|
||||
clock.advance_by(1_000_000_000)
|
||||
publisher._poll_once()
|
||||
snapshot_after_transition = publisher.read()
|
||||
|
||||
assert snapshot_after_initial.thermal_throttle_active is False
|
||||
assert snapshot_after_initial.cpu_temp_c == pytest.approx(45.0)
|
||||
assert snapshot_after_initial.measured_at_ns == 1_000_000_000
|
||||
|
||||
assert snapshot_after_transition.thermal_throttle_active is True
|
||||
assert snapshot_after_transition.gpu_temp_c == pytest.approx(92.0)
|
||||
assert snapshot_after_transition.is_telemetry_available is True
|
||||
assert snapshot_after_transition.measured_at_ns == 2_000_000_000
|
||||
|
||||
|
||||
def test_ac1_read_is_wait_free_object_identity(
|
||||
config: Config, fdr: FakeFdrSink, clock: _ManualClock
|
||||
) -> None:
|
||||
"""Two consecutive reads with no intervening poll return the same object."""
|
||||
|
||||
source = _ScriptedSource([_cool_reading()])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
_prime_without_thread(publisher, source)
|
||||
|
||||
first = publisher.read()
|
||||
second = publisher.read()
|
||||
|
||||
assert first is second
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-2: throttle entry — FDR + WARN log + state.
|
||||
|
||||
|
||||
def test_ac2_throttle_entry_emits_fdr_and_warn(
|
||||
config: Config,
|
||||
fdr: FakeFdrSink,
|
||||
clock: _ManualClock,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
source = _ScriptedSource([_cool_reading(), _hot_reading()])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
_prime_without_thread(publisher, source)
|
||||
assert fdr.records == []
|
||||
|
||||
clock.advance_by(1_000_000_000)
|
||||
with caplog.at_level("WARNING", logger="c7_inference.thermal"):
|
||||
publisher._poll_once()
|
||||
|
||||
assert len(fdr.records) == 1
|
||||
record = fdr.records[0]
|
||||
assert record.kind == _TRANSITION_KIND
|
||||
assert record.producer_id == _PRODUCER_ID
|
||||
assert record.payload["previous_state"] is False
|
||||
assert record.payload["new_state"] is True
|
||||
assert record.payload["gpu_temp_c"] == pytest.approx(92.0)
|
||||
assert record.payload["cpu_temp_c"] == pytest.approx(85.0)
|
||||
assert record.payload["measured_clock_mhz"] == 600
|
||||
assert record.payload["measured_at_ns"] == 2_000_000_000
|
||||
|
||||
entry_warnings = [
|
||||
r for r in caplog.records
|
||||
if r.levelname == "WARNING" and r.getMessage() == "thermal throttle ENTRY"
|
||||
]
|
||||
assert len(entry_warnings) == 1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-3: throttle exit — FDR + INFO log.
|
||||
|
||||
|
||||
def test_ac3_throttle_exit_emits_fdr_and_info(
|
||||
config: Config,
|
||||
fdr: FakeFdrSink,
|
||||
clock: _ManualClock,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
source = _ScriptedSource([_hot_reading(), _cool_reading()])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
_prime_without_thread(publisher, source)
|
||||
|
||||
clock.advance_by(1_000_000_000)
|
||||
with caplog.at_level("INFO", logger="c7_inference.thermal"):
|
||||
publisher._poll_once()
|
||||
|
||||
assert len(fdr.records) == 1
|
||||
record = fdr.records[0]
|
||||
assert record.kind == _TRANSITION_KIND
|
||||
assert record.payload["previous_state"] is True
|
||||
assert record.payload["new_state"] is False
|
||||
|
||||
exit_infos = [
|
||||
r for r in caplog.records
|
||||
if r.levelname == "INFO" and r.getMessage() == "thermal throttle exit"
|
||||
]
|
||||
assert len(exit_infos) == 1
|
||||
exit_warnings = [
|
||||
r for r in caplog.records
|
||||
if r.levelname == "WARNING" and r.getMessage() == "thermal throttle exit"
|
||||
]
|
||||
assert exit_warnings == []
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-4: telemetry unavailable on every poll — default-safe + rate-limited WARN.
|
||||
|
||||
|
||||
def test_ac4_telemetry_unavailable_defaults_safe_and_rate_limits(
|
||||
config: Config,
|
||||
fdr: FakeFdrSink,
|
||||
clock: _ManualClock,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
err = TelemetryUnavailableError("jtop hung")
|
||||
source = _ScriptedSource([err, err, err, err, err, err])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
|
||||
with caplog.at_level("WARNING", logger="c7_inference.thermal"):
|
||||
_prime_without_thread(publisher, source)
|
||||
# Drive five more polls across a 2-second simulated window.
|
||||
clock.advance_by(500_000_000)
|
||||
publisher._poll_once()
|
||||
clock.advance_by(500_000_000)
|
||||
publisher._poll_once()
|
||||
clock.advance_by(500_000_000)
|
||||
publisher._poll_once()
|
||||
clock.advance_by(500_000_000)
|
||||
publisher._poll_once()
|
||||
clock.advance_by(500_000_000)
|
||||
publisher._poll_once()
|
||||
|
||||
snapshot = publisher.read()
|
||||
|
||||
assert snapshot.is_telemetry_available is False
|
||||
assert snapshot.thermal_throttle_active is False
|
||||
assert snapshot.gpu_temp_c is None
|
||||
assert snapshot.cpu_temp_c is None
|
||||
|
||||
unavailable_warnings = [
|
||||
r for r in caplog.records
|
||||
if r.levelname == "WARNING" and r.getMessage() == "thermal telemetry unavailable"
|
||||
]
|
||||
# Initial poll (start) + 5 polls x 500 ms each = 2.5 s window; rate-limit
|
||||
# at 1 / sec ⇒ at most 3 WARNs (one immediately + one each second).
|
||||
assert 1 <= len(unavailable_warnings) <= 3
|
||||
assert fdr.records == []
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-5: cold-start with no source raises.
|
||||
|
||||
|
||||
def test_ac5_start_raises_when_no_source_available(
|
||||
config: Config,
|
||||
fdr: FakeFdrSink,
|
||||
clock: _ManualClock,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
def _raise(*_args: object, **_kw: object) -> None:
|
||||
raise TelemetryUnavailableError("backend missing")
|
||||
|
||||
monkeypatch.setattr(tp_mod, "_JtopSource", _raise)
|
||||
monkeypatch.setattr(tp_mod, "_PynvmlSource", _raise)
|
||||
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=None)
|
||||
|
||||
with pytest.raises(TelemetryUnavailableError):
|
||||
publisher.start()
|
||||
|
||||
assert publisher.is_running() is False
|
||||
|
||||
|
||||
def test_ac5_jtop_falls_back_to_pynvml(
|
||||
fast_config: Config,
|
||||
fdr: FakeFdrSink,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""When ``jtop`` fails, ``start()`` picks the ``pynvml`` source and runs."""
|
||||
|
||||
pynvml_source = _ScriptedSource(
|
||||
[_cool_reading()] + [_cool_reading()] * 16
|
||||
)
|
||||
|
||||
def _raise_jtop(*_args: object, **_kw: object) -> None:
|
||||
raise TelemetryUnavailableError("no jtop")
|
||||
|
||||
def _make_pynvml(*_args: object, **_kw: object) -> _ScriptedSource:
|
||||
return pynvml_source
|
||||
|
||||
monkeypatch.setattr(tp_mod, "_JtopSource", _raise_jtop)
|
||||
monkeypatch.setattr(tp_mod, "_PynvmlSource", _make_pynvml)
|
||||
|
||||
publisher = ThermalStatePublisher(fast_config, fdr)
|
||||
publisher.start()
|
||||
assert publisher.is_running() is True
|
||||
snapshot = publisher.read()
|
||||
publisher.stop()
|
||||
|
||||
assert snapshot.is_telemetry_available is True
|
||||
assert pynvml_source.closed is True
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-6: start/stop is idempotent.
|
||||
|
||||
|
||||
def test_ac6_double_start_is_noop(
|
||||
fast_config: Config, fdr: FakeFdrSink
|
||||
) -> None:
|
||||
source = _ScriptedSource([_cool_reading()])
|
||||
publisher = ThermalStatePublisher(fast_config, fdr, source=source)
|
||||
|
||||
publisher.start()
|
||||
first_thread = publisher._thread
|
||||
publisher.start()
|
||||
second_thread = publisher._thread
|
||||
|
||||
assert publisher.is_running() is True
|
||||
assert first_thread is second_thread
|
||||
|
||||
publisher.stop()
|
||||
|
||||
|
||||
def test_ac6_double_stop_is_noop(
|
||||
fast_config: Config, fdr: FakeFdrSink
|
||||
) -> None:
|
||||
source = _ScriptedSource([_cool_reading()])
|
||||
publisher = ThermalStatePublisher(fast_config, fdr, source=source)
|
||||
|
||||
publisher.start()
|
||||
publisher.stop()
|
||||
publisher.stop()
|
||||
|
||||
assert publisher.is_running() is False
|
||||
|
||||
|
||||
def test_background_thread_polls(
|
||||
fast_config: Config, fdr: FakeFdrSink
|
||||
) -> None:
|
||||
"""End-to-end: real thread, 200 Hz poll rate, source flips on second read."""
|
||||
|
||||
readings: deque[ThermalReading] = deque(
|
||||
[_cool_reading()] + [_hot_reading()] * 64
|
||||
)
|
||||
|
||||
class _Live:
|
||||
closed = False
|
||||
|
||||
def read(self) -> ThermalReading:
|
||||
if readings:
|
||||
return readings.popleft()
|
||||
return _hot_reading()
|
||||
|
||||
def close(self) -> None:
|
||||
self._closed_flag = True
|
||||
|
||||
source = _Live()
|
||||
publisher = ThermalStatePublisher(fast_config, fdr, source=source)
|
||||
publisher.start()
|
||||
deadline = time.monotonic() + 1.0
|
||||
while publisher.read().thermal_throttle_active is False and time.monotonic() < deadline:
|
||||
time.sleep(0.005)
|
||||
publisher.stop()
|
||||
|
||||
assert publisher.read().thermal_throttle_active is True
|
||||
assert any(r.kind == _TRANSITION_KIND for r in fdr.records)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# AC-8: FDR record envelope matches the schema (round-trip via serialise/parse).
|
||||
|
||||
|
||||
def test_ac8_fdr_record_round_trips_through_wire_format(
|
||||
config: Config, fdr: FakeFdrSink, clock: _ManualClock
|
||||
) -> None:
|
||||
from gps_denied_onboard.fdr_client.records import KNOWN_PAYLOAD_KEYS, parse, serialise
|
||||
|
||||
source = _ScriptedSource([_cool_reading(), _hot_reading()])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
_prime_without_thread(publisher, source)
|
||||
clock.advance_by(1_000_000_000)
|
||||
publisher._poll_once()
|
||||
|
||||
record = fdr.records[0]
|
||||
encoded = serialise(record)
|
||||
decoded = parse(encoded)
|
||||
|
||||
assert decoded.kind == _TRANSITION_KIND
|
||||
assert decoded.producer_id == _PRODUCER_ID
|
||||
expected_keys = KNOWN_PAYLOAD_KEYS[_TRANSITION_KIND]
|
||||
assert set(decoded.payload.keys()) - {"extra"} <= expected_keys
|
||||
assert decoded.payload == record.payload
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# NFR-reliability-default-safe: first poll fails → snapshot is default-safe.
|
||||
|
||||
|
||||
def test_nfr_default_safe_on_first_poll_failure(
|
||||
config: Config, fdr: FakeFdrSink, clock: _ManualClock
|
||||
) -> None:
|
||||
err = TelemetryUnavailableError("source dead on arrival")
|
||||
source = _ScriptedSource([err])
|
||||
publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)
|
||||
_prime_without_thread(publisher, source)
|
||||
|
||||
snapshot = publisher.read()
|
||||
|
||||
assert snapshot.is_telemetry_available is False
|
||||
assert snapshot.thermal_throttle_active is False
|
||||
assert snapshot.gpu_temp_c is None
|
||||
assert snapshot.cpu_temp_c is None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Structural: ThermalSource is a runtime-checkable Protocol; the scripted
|
||||
# source satisfies it.
|
||||
|
||||
|
||||
def test_scripted_source_satisfies_protocol() -> None:
|
||||
source = _ScriptedSource([_cool_reading()])
|
||||
assert isinstance(source, ThermalSource)
|
||||
|
||||
|
||||
def test_default_safe_snapshot_invariant_i6() -> None:
|
||||
"""``is_telemetry_available == False`` ⇒ ``thermal_throttle_active == False``."""
|
||||
|
||||
snap = tp_mod._default_safe_snapshot(measured_at_ns=42)
|
||||
assert snap.is_telemetry_available is False
|
||||
assert snap.thermal_throttle_active is False
|
||||
assert snap.measured_at_ns == 42
|
||||
Reference in New Issue
Block a user