mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 21:21:13 +00:00
49a67f770d
Implements AZ-297 InferenceRuntime's thermal_state() side: a singleton background-thread publisher that polls jtop (preferred) or pynvml (fallback) at config.thermal_poll_hz, stores an atomic ThermalState snapshot, and emits c7.thermal_transition FDR records on every throttle flip with a WARN log on entry and an INFO log on exit. Default-safe on TelemetryUnavailableError per Invariant I-6 with a 1-Hz rate-limited WARN. Sources return a raw ThermalReading; the publisher stamps measured_at_ns via its injected Clock so _JtopSource / _PynvmlSource stay clean of direct time.* calls (Invariant 2). _poll_once is the deterministic test seam — start() spawns the production thread. - c7.thermal_transition registered in fdr_client.records KNOWN_PAYLOAD_KEYS - [telemetry] optional dep group (jetson-stats, pynvml) added to pyproject - 14 unit tests (AC-1..AC-6, AC-8, NFR-default-safe, structural) green; AC-7 / AC-1 microbench / NFR-perf-poll Tier-2 deferred - full unit suite: 1140 passed, 11 expected Tier-2/CUDA skips Co-authored-by: Cursor <cursoragent@cursor.com>
507 lines
16 KiB
Python
507 lines
16 KiB
Python
"""AZ-302 — ``ThermalStatePublisher`` acceptance tests.
|
|
|
|
CPU-only. The deterministic tests (AC-1..AC-4, AC-8, NFR default-safe)
drive the publisher through its ``_poll_once()`` seam with a manual clock +
a scripted ``ThermalSource``; no real ``jtop`` / ``pynvml`` backend is ever
involved (those paths live behind the source-selection fallback and are
Tier-2 by construction — see ``read_host_tuple`` analog comments in AZ-301).

The publisher's background thread is exercised only by the tests that call
``start()``: the AC-5 fallback test, the AC-6 idempotency tests and
``test_background_thread_polls``. The throttle-detection-latency contract
(AC-2 / AC-3) is verified deterministically on the ``_poll_once`` seam
because the wall-clock latency is dominated by the publisher's own
period (1 / ``thermal_poll_hz`` s), not by code under test.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from collections import deque
|
|
from typing import Final
|
|
|
|
import pytest
|
|
|
|
from gps_denied_onboard.components.c7_inference import (
|
|
C7InferenceConfig,
|
|
TelemetryUnavailableError,
|
|
ThermalReading,
|
|
ThermalSource,
|
|
ThermalStatePublisher,
|
|
)
|
|
from gps_denied_onboard.components.c7_inference import thermal_publisher as tp_mod
|
|
from gps_denied_onboard.config.schema import Config
|
|
from gps_denied_onboard.fdr_client.fakes import FakeFdrSink
|
|
|
|
_PRODUCER_ID: Final[str] = "c7_inference.thermal"
|
|
_TRANSITION_KIND: Final[str] = "c7.thermal_transition"
|
|
|
|
|
|
class _ManualClock:
|
|
"""Test ``Clock`` that returns whatever ``advance_to`` set last.
|
|
|
|
``monotonic_ns`` and ``time_ns`` share the same backing value because
|
|
AZ-302 only ever reads ``monotonic_ns``; ``time_ns`` is unused.
|
|
"""
|
|
|
|
def __init__(self, *, start_ns: int = 1_000_000_000) -> None:
|
|
self._now_ns: int = start_ns
|
|
|
|
def monotonic_ns(self) -> int:
|
|
return self._now_ns
|
|
|
|
def time_ns(self) -> int:
|
|
return self._now_ns
|
|
|
|
def sleep_until_ns(self, target_ns: int) -> None: # pragma: no cover - unused
|
|
self._now_ns = max(self._now_ns, target_ns)
|
|
|
|
def advance_to(self, ns: int) -> None:
|
|
if ns < self._now_ns:
|
|
raise AssertionError("monotonic clock cannot go backwards")
|
|
self._now_ns = ns
|
|
|
|
def advance_by(self, delta_ns: int) -> None:
|
|
self.advance_to(self._now_ns + delta_ns)
|
|
|
|
|
|
class _ScriptedSource:
|
|
"""``ThermalSource`` whose ``read()`` plays back a scripted queue.
|
|
|
|
Each queue entry is either a :class:`ThermalReading` (returned) or a
|
|
:class:`TelemetryUnavailableError` instance (raised). When the queue
|
|
is drained, the last entry is repeated indefinitely so steady-state
|
|
polls do not need explicit padding.
|
|
"""
|
|
|
|
def __init__(self, script: list[ThermalReading | TelemetryUnavailableError]) -> None:
|
|
if not script:
|
|
raise ValueError("_ScriptedSource needs at least one entry")
|
|
self._queue: deque[ThermalReading | TelemetryUnavailableError] = deque(script)
|
|
self._last: ThermalReading | TelemetryUnavailableError = script[-1]
|
|
self.closed: bool = False
|
|
|
|
def read(self) -> ThermalReading:
|
|
if self._queue:
|
|
head = self._queue.popleft()
|
|
self._last = head
|
|
else:
|
|
head = self._last
|
|
if isinstance(head, TelemetryUnavailableError):
|
|
raise head
|
|
return head
|
|
|
|
def close(self) -> None:
|
|
self.closed = True
|
|
|
|
|
|
@pytest.fixture
def config() -> Config:
    """Baseline configuration with ``thermal_poll_hz`` set to 1.0."""
    c7_block = C7InferenceConfig(thermal_poll_hz=1.0)
    return Config.with_blocks(c7_inference=c7_block)
|
|
|
|
|
|
@pytest.fixture
def clock() -> _ManualClock:
    """Provide a fresh manual clock at its default start instant."""
    manual_clock = _ManualClock()
    return manual_clock
|
|
|
|
|
|
@pytest.fixture
def fdr() -> FakeFdrSink:
    """Provide a fake FDR sink registered under the thermal producer id."""
    sink = FakeFdrSink(producer_id=_PRODUCER_ID)
    return sink
|
|
|
|
|
|
@pytest.fixture
def fast_config() -> Config:
    """High-frequency config so the AC-6 background-thread tests finish quickly."""

    c7_block = C7InferenceConfig(thermal_poll_hz=200.0)
    return Config.with_blocks(c7_inference=c7_block)
|
|
|
|
|
|
def _make_publisher(
    *,
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    source: ThermalSource | None,
) -> ThermalStatePublisher:
    """Build a ``ThermalStatePublisher`` wired to the given test doubles.

    All parameters are keyword-only so call sites stay self-describing.
    ``source`` may be ``None``, in which case it is passed through as-is.
    """
    publisher = ThermalStatePublisher(config, fdr, clock=clock, source=source)
    return publisher
|
|
|
|
|
|
def _prime_without_thread(
    publisher: ThermalStatePublisher, source: ThermalSource
) -> None:
    """Prime ``publisher`` with one initial poll, without a background thread.

    Installs ``source`` on the publisher, clears its stop event and runs a
    single ``_poll_once(initial=True)``. This is meant to stand in for the
    source-selection + initial-poll part of ``start()``.

    The deterministic AC-1..AC-4 / AC-8 tests call :meth:`_poll_once`
    themselves under a manual clock; a live production thread would race
    those calls and consume scripted source entries non-deterministically
    (one such race was observed in the first iteration of this module).
    """

    # Order matters: the source must be in place before the poll reads it.
    publisher._source = source
    publisher._stop_event.clear()
    publisher._poll_once(initial=True)
|
|
|
|
|
|
def _cool_reading(*, throttle: bool = False) -> ThermalReading:
    """Return a low-temperature reading (45 °C CPU, 55 °C GPU, 1300 MHz).

    ``throttle`` sets ``thermal_throttle_active`` and defaults to ``False``.
    """
    reading = ThermalReading(
        cpu_temp_c=45.0,
        gpu_temp_c=55.0,
        thermal_throttle_active=throttle,
        measured_clock_mhz=1300,
    )
    return reading
|
|
|
|
|
|
def _hot_reading() -> ThermalReading:
    """Return a throttled reading (85 °C CPU, 92 °C GPU, clock at 600 MHz)."""
    reading = ThermalReading(
        cpu_temp_c=85.0,
        gpu_temp_c=92.0,
        thermal_throttle_active=True,
        measured_clock_mhz=600,
    )
    return reading
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-1: read() returns the most recent _atomic_snapshot.
|
|
|
|
|
|
def test_ac1_read_returns_latest_snapshot(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """``read()`` reflects the initial poll, then the poll one second later."""
    one_second_ns = 1_000_000_000
    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)

    _prime_without_thread(publisher, scripted)
    initial = publisher.read()

    clock.advance_by(one_second_ns)
    publisher._poll_once()
    flipped = publisher.read()

    # Initial poll: the cool reading, stamped at the manual clock's start.
    assert initial.thermal_throttle_active is False
    assert initial.cpu_temp_c == pytest.approx(45.0)
    assert initial.measured_at_ns == 1_000_000_000

    # Second poll: the hot reading, stamped one simulated second later.
    assert flipped.thermal_throttle_active is True
    assert flipped.gpu_temp_c == pytest.approx(92.0)
    assert flipped.is_telemetry_available is True
    assert flipped.measured_at_ns == 2_000_000_000
|
|
|
|
|
|
def test_ac1_read_is_wait_free_object_identity(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """Two consecutive reads with no intervening poll return the same object."""

    scripted = _ScriptedSource([_cool_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)

    # No poll happens between these two reads.
    snapshots = (publisher.read(), publisher.read())

    assert snapshots[0] is snapshots[1]
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-2: throttle entry — FDR + WARN log + state.
|
|
|
|
|
|
def test_ac2_throttle_entry_emits_fdr_and_warn(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A cool → hot flip yields one transition record and one ENTRY warning."""
    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)
    # The initial (cool) poll must not have produced any FDR record.
    assert fdr.records == []

    clock.advance_by(1_000_000_000)
    with caplog.at_level("WARNING", logger="c7_inference.thermal"):
        publisher._poll_once()

    assert len(fdr.records) == 1
    transition = fdr.records[0]
    assert transition.kind == _TRANSITION_KIND
    assert transition.producer_id == _PRODUCER_ID

    # The payload carries both throttle states plus the hot reading's values.
    payload = transition.payload
    assert payload["previous_state"] is False
    assert payload["new_state"] is True
    assert payload["gpu_temp_c"] == pytest.approx(92.0)
    assert payload["cpu_temp_c"] == pytest.approx(85.0)
    assert payload["measured_clock_mhz"] == 600
    assert payload["measured_at_ns"] == 2_000_000_000

    entry_warning_count = sum(
        1
        for rec in caplog.records
        if rec.levelname == "WARNING" and rec.getMessage() == "thermal throttle ENTRY"
    )
    assert entry_warning_count == 1
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-3: throttle exit — FDR + INFO log.
|
|
|
|
|
|
def test_ac3_throttle_exit_emits_fdr_and_info(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A hot → cool flip yields one transition record, logged at INFO only."""
    scripted = _ScriptedSource([_hot_reading(), _cool_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)

    clock.advance_by(1_000_000_000)
    with caplog.at_level("INFO", logger="c7_inference.thermal"):
        publisher._poll_once()

    assert len(fdr.records) == 1
    transition = fdr.records[0]
    assert transition.kind == _TRANSITION_KIND
    assert transition.payload["previous_state"] is True
    assert transition.payload["new_state"] is False

    # Levels at which the exit message was logged: exactly one INFO and
    # never a WARNING.
    exit_levels = [
        rec.levelname
        for rec in caplog.records
        if rec.getMessage() == "thermal throttle exit"
    ]
    assert exit_levels.count("INFO") == 1
    assert "WARNING" not in exit_levels
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-4: telemetry unavailable on every poll — default-safe + rate-limited WARN.
|
|
|
|
|
|
def test_ac4_telemetry_unavailable_defaults_safe_and_rate_limits(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Every poll fails: the snapshot stays default-safe and WARNs are rate-limited.

    The source raises ``TelemetryUnavailableError`` on the initial poll and
    on five follow-up polls spaced 500 ms apart. The test checks that the
    published snapshot is the default-safe one, that the "thermal telemetry
    unavailable" WARN appears between one and three times, and that no FDR
    record is emitted.
    """
    poll_step_ns = 500_000_000
    follow_up_polls = 5

    err = TelemetryUnavailableError("jtop hung")
    # One scripted failure for the initial poll plus one per follow-up poll.
    source = _ScriptedSource([err] * (1 + follow_up_polls))
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=source)

    with caplog.at_level("WARNING", logger="c7_inference.thermal"):
        _prime_without_thread(publisher, source)
        # Drive five more polls across a 2.5-second simulated window.
        for _ in range(follow_up_polls):
            clock.advance_by(poll_step_ns)
            publisher._poll_once()

    snapshot = publisher.read()

    assert snapshot.is_telemetry_available is False
    assert snapshot.thermal_throttle_active is False
    assert snapshot.gpu_temp_c is None
    assert snapshot.cpu_temp_c is None

    unavailable_warnings = [
        r for r in caplog.records
        if r.levelname == "WARNING" and r.getMessage() == "thermal telemetry unavailable"
    ]
    # Initial poll (start) + 5 polls x 500 ms each = 2.5 s window; rate-limit
    # at 1 / sec ⇒ at most 3 WARNs (one immediately + one each second).
    assert 1 <= len(unavailable_warnings) <= 3
    assert fdr.records == []
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-5: cold-start with no source raises.
|
|
|
|
|
|
def test_ac5_start_raises_when_no_source_available(
    config: Config,
    fdr: FakeFdrSink,
    clock: _ManualClock,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With both backends failing, ``start()`` raises and nothing is running."""

    def _refuse(*_args: object, **_kw: object) -> None:
        # Stand-in for a source constructor whose backend is missing.
        raise TelemetryUnavailableError("backend missing")

    for backend_name in ("_JtopSource", "_PynvmlSource"):
        monkeypatch.setattr(tp_mod, backend_name, _refuse)

    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=None)

    with pytest.raises(TelemetryUnavailableError):
        publisher.start()

    assert publisher.is_running() is False
|
|
|
|
|
|
def test_ac5_jtop_falls_back_to_pynvml(
    fast_config: Config,
    fdr: FakeFdrSink,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """When ``jtop`` fails, ``start()`` picks the ``pynvml`` source and runs.

    ``_JtopSource`` is patched to raise ``TelemetryUnavailableError`` and
    ``_PynvmlSource`` to hand back a scripted source. After ``stop()`` the
    scripted source must report that it was closed.
    """

    # 17 cool readings, the same number of entries as the original script.
    pynvml_source = _ScriptedSource([_cool_reading()] * 17)

    def _raise_jtop(*_args: object, **_kw: object) -> None:
        raise TelemetryUnavailableError("no jtop")

    def _make_pynvml(*_args: object, **_kw: object) -> _ScriptedSource:
        return pynvml_source

    monkeypatch.setattr(tp_mod, "_JtopSource", _raise_jtop)
    monkeypatch.setattr(tp_mod, "_PynvmlSource", _make_pynvml)

    publisher = ThermalStatePublisher(fast_config, fdr)
    publisher.start()
    try:
        assert publisher.is_running() is True
        snapshot = publisher.read()
    finally:
        # Always stop: a failed assertion above must not leave the
        # background thread running into later tests.
        publisher.stop()

    assert snapshot.is_telemetry_available is True
    assert pynvml_source.closed is True
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-6: start/stop is idempotent.
|
|
|
|
|
|
def test_ac6_double_start_is_noop(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """A second ``start()`` leaves the publisher running on the same thread."""
    source = _ScriptedSource([_cool_reading()])
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)

    publisher.start()
    try:
        first_thread = publisher._thread
        publisher.start()
        second_thread = publisher._thread

        assert publisher.is_running() is True
        assert first_thread is second_thread
    finally:
        # Always stop: a failed assertion above must not leave the
        # background thread running into later tests.
        publisher.stop()
|
|
|
|
|
|
def test_ac6_double_stop_is_noop(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """Calling ``stop()`` twice after ``start()`` leaves the publisher stopped."""
    scripted = _ScriptedSource([_cool_reading()])
    publisher = ThermalStatePublisher(fast_config, fdr, source=scripted)

    publisher.start()
    for _ in range(2):
        publisher.stop()

    assert publisher.is_running() is False
|
|
|
|
|
|
def test_background_thread_polls(
    fast_config: Config, fdr: FakeFdrSink
) -> None:
    """End-to-end: real thread, 200 Hz poll rate, source flips on second read.

    The test waits up to one wall-clock second for a throttled snapshot to
    be published, then checks that at least one transition record reached
    the FDR sink.
    """

    readings: deque[ThermalReading] = deque(
        [_cool_reading()] + [_hot_reading()] * 64
    )

    class _Live:
        # Flipped by close(); same attribute name as ``_ScriptedSource.closed``.
        closed = False

        def read(self) -> ThermalReading:
            if readings:
                return readings.popleft()
            # Script exhausted: keep reporting the throttled state.
            return _hot_reading()

        def close(self) -> None:
            self.closed = True

    source = _Live()
    publisher = ThermalStatePublisher(fast_config, fdr, source=source)
    publisher.start()
    try:
        deadline = time.monotonic() + 1.0
        while publisher.read().thermal_throttle_active is False and time.monotonic() < deadline:
            time.sleep(0.005)
    finally:
        # Always stop so the background thread cannot outlive this test.
        publisher.stop()

    assert publisher.read().thermal_throttle_active is True
    assert any(r.kind == _TRANSITION_KIND for r in fdr.records)
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# AC-8: FDR record envelope matches the schema (round-trip via serialise/parse).
|
|
|
|
|
|
def test_ac8_fdr_record_round_trips_through_wire_format(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """The transition record survives ``serialise`` → ``parse`` unchanged.

    Every decoded payload key other than ``"extra"`` must be listed in
    ``KNOWN_PAYLOAD_KEYS`` for the transition kind.
    """
    from gps_denied_onboard.fdr_client.records import KNOWN_PAYLOAD_KEYS, parse, serialise

    scripted = _ScriptedSource([_cool_reading(), _hot_reading()])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=scripted)
    _prime_without_thread(publisher, scripted)
    clock.advance_by(1_000_000_000)
    publisher._poll_once()

    original = fdr.records[0]
    round_tripped = parse(serialise(original))

    assert round_tripped.kind == _TRANSITION_KIND
    assert round_tripped.producer_id == _PRODUCER_ID
    allowed_keys = KNOWN_PAYLOAD_KEYS[_TRANSITION_KIND]
    payload_keys = set(round_tripped.payload.keys()) - {"extra"}
    assert payload_keys <= allowed_keys
    assert round_tripped.payload == original.payload
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# NFR-reliability-default-safe: first poll fails → snapshot is default-safe.
|
|
|
|
|
|
def test_nfr_default_safe_on_first_poll_failure(
    config: Config, fdr: FakeFdrSink, clock: _ManualClock
) -> None:
    """If the very first poll fails, ``read()`` yields the default-safe snapshot."""
    dead_source = _ScriptedSource([TelemetryUnavailableError("source dead on arrival")])
    publisher = _make_publisher(config=config, fdr=fdr, clock=clock, source=dead_source)
    _prime_without_thread(publisher, dead_source)

    observed = publisher.read()

    # Default-safe: telemetry flagged unavailable, no throttle, no temperatures.
    assert observed.is_telemetry_available is False
    assert observed.thermal_throttle_active is False
    assert observed.gpu_temp_c is None
    assert observed.cpu_temp_c is None
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Structural: ThermalSource is a runtime-checkable Protocol; the scripted
|
|
# source satisfies it.
|
|
|
|
|
|
def test_scripted_source_satisfies_protocol() -> None:
    """The scripted test double passes an ``isinstance`` check against ``ThermalSource``."""
    scripted = _ScriptedSource([_cool_reading()])
    assert isinstance(scripted, ThermalSource)
|
|
|
|
|
|
def test_default_safe_snapshot_invariant_i6() -> None:
    """``is_telemetry_available == False`` ⇒ ``thermal_throttle_active == False``."""

    stamp_ns = 42
    default_safe = tp_mod._default_safe_snapshot(measured_at_ns=stamp_ns)

    assert default_safe.is_telemetry_available is False
    assert default_safe.thermal_throttle_active is False
    # The timestamp passed in is carried through unchanged.
    assert default_safe.measured_at_ns == 42
|