[AZ-273] [AZ-274] [AZ-275] [AZ-267] [AZ-268] FDR producer chain + log bridge + contract test

AZ-273: lock-free SPSC ring buffer with pre-allocated slots, power-of-
two capacity, opt-in SPSC guard, and EnqueueResult / FdrSpscViolationError
on the public surface. make_fdr_client caches one client per producer_id
and reads capacity from config.fdr.per_producer_capacity with fallback
to queue_size.
AZ-274: default_overrun_policy implements drop-oldest + retry + immediate
marker emission, with prior-marker dropped_count folding via _evict_one
so user-loss info is never lost across iterations. ERROR diagnostic is
rate-limited to <=1/sec per producer.
AZ-275: FakeFdrSink mirrors the FdrClient public surface and reuses the
production default_overrun_policy via a duck-typed _PolicyAdapter. The
test-only records/all_records_ever properties let component tests assert
both in-buffer and lifetime state. tests/conftest.py registers the
fake_fdr_sink fixture and an AST architecture lint forbids production
imports of fakes.
AZ-267: FdrLogBridgeHandler installs on the root logger via wire_log_bridge
and forwards only WARN+ERROR records into the FDR with kind="log".
Thread-local recursion guard short-circuits internal logging; saturated-
queue diagnostics go to stderr every N=1000 drops.
AZ-268: tests/contract/log_schema.py covers every row of the schema's
Test Cases table plus the "DEBUG+INFO never reach FDR" invariant.
pyproject.toml registers the contract pytest marker and the
contract-mandated log_schema.py file-name.
251 unit + contract tests pass (48 new). Review verdict:
PASS_WITH_WARNINGS; findings are NFR-perf deferrals + documented
relaxation of AZ-274 AC-2 coalescing under permanently-stalled consumer.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-11 03:00:49 +03:00
parent 3acc7f33dd
commit ba20c2d195
24 changed files with 2714 additions and 20 deletions
+217 -6
View File
@@ -1,16 +1,227 @@
"""FDR producer-side client API — STUB.
"""``FdrClient`` — producer-side FDR queue (AZ-273 / E-CC-FDR-CLIENT).
Concrete client is owned by AZ-273. Bootstrap exposes the class so every component
can type `fdr: FdrClient` on its constructor.
The single handle every onboard producer holds. Calls :meth:`enqueue`
on its component-local frame; never blocks. The consumer side is the
C13 writer thread (AZ-248) which drains via :meth:`pop_one` /
:meth:`drain`.
Public surface frozen by
``_docs/02_document/contracts/shared_fdr_client/fdr_client_protocol.md``
v1.0.0.
Capacity sourcing
-----------------
``make_fdr_client(producer_id, config)`` resolves the per-producer
capacity in this precedence order:
1. ``config.fdr.per_producer_capacity[producer_id]`` if present.
2. ``config.fdr.queue_size`` (the documented cross-cutting default).
If the resolved value is not a positive power of two, it is rounded UP
to the next power of two (and clipped to :data:`MIN_CAPACITY`).
"""
from __future__ import annotations
from collections.abc import Callable
from typing import Final
from gps_denied_onboard.config import Config
from gps_denied_onboard.fdr_client.queue import (
MIN_CAPACITY,
FdrSpscViolationError,
SpscRingBuffer,
)
from gps_denied_onboard.fdr_client.records import FdrRecord
from gps_denied_onboard.logging import get_logger
__all__ = [
"EnqueueResult",
"FdrClient",
"FdrSpscViolationError",
"make_fdr_client",
]
class EnqueueResult:
"""Return value of :meth:`FdrClient.enqueue`.
String enum (not :class:`enum.Enum`) so the steady-state path
returns an interned string instead of allocating a wrapper.
"""
OK: Final[str] = "ok"
OVERRUN: Final[str] = "overrun"
_DIAG_LOGGER_NAME: Final[str] = "shared.fdr_client"
def _next_power_of_two(value: int) -> int:
"""Round ``value`` UP to the next power of two (clipped to ``MIN_CAPACITY``)."""
if value <= MIN_CAPACITY:
return MIN_CAPACITY
if value & (value - 1) == 0:
return value
return 1 << value.bit_length()
class FdrClient:
"""Producer-side FDR API: enqueue records, drop-oldest on overrun."""
"""Producer-side FDR queue handle for a single component."""
def emit(self, record: FdrRecord) -> None:
raise NotImplementedError("FdrClient.emit concrete impl is AZ-273")
def __init__(
self,
producer_id: str,
capacity: int,
*,
enforce_spsc: bool = False,
_emit_diag_log: bool = True,
) -> None:
if not isinstance(producer_id, str) or not producer_id:
raise ValueError(
f"FdrClient.producer_id must be a non-empty string; got {producer_id!r}"
)
normalised_capacity = _next_power_of_two(capacity)
self._buffer: SpscRingBuffer = SpscRingBuffer(
normalised_capacity, enforce_spsc=enforce_spsc
)
self._producer_id: str = producer_id
self._on_overrun: Callable[[FdrRecord], None] | None = None
if _emit_diag_log:
# One-time INFO at construction (NOT on the steady-state path).
get_logger(_DIAG_LOGGER_NAME).info(
"FdrClient constructed",
extra={
"component": _DIAG_LOGGER_NAME,
"kind": "fdr.client_constructed",
"kv": {"producer_id": producer_id, "capacity": normalised_capacity},
},
)
@property
def producer_id(self) -> str:
return self._producer_id
@property
def on_overrun(self) -> Callable[[FdrRecord], None] | None:
return self._on_overrun
@on_overrun.setter
def on_overrun(self, hook: Callable[[FdrRecord], None] | None) -> None:
if hook is not None and not callable(hook):
raise TypeError(f"on_overrun hook must be callable or None; got {hook!r}")
self._on_overrun = hook
def enqueue(self, record: FdrRecord) -> str:
"""Non-blocking single-producer enqueue.
Returns :attr:`EnqueueResult.OK` on success or
:attr:`EnqueueResult.OVERRUN` when the buffer is full.
On overrun, the ``on_overrun`` hook (if set) is invoked exactly
once with the offending record before returning.
"""
ok = self._buffer.push(record)
if ok:
return EnqueueResult.OK
hook = self._on_overrun
if hook is not None:
try:
hook(record)
except Exception:
# Overrun-path closure errors are swallowed to keep the
# producer's hot path free of exceptions. The policy's
# own error logging (AZ-274 NFR-reliability) records
# what went wrong.
pass
return EnqueueResult.OVERRUN
def pop_one(self) -> FdrRecord | None:
"""Single-consumer dequeue. Returns ``None`` when the buffer is empty."""
return self._buffer.pop()
def drain(self, max_records: int) -> list[FdrRecord]:
"""Single-consumer drain up to ``max_records`` records in FIFO order."""
return self._buffer.drain(max_records)
def flush(self) -> None:
"""Test-only: spin until the buffer is empty.
Production code MUST NOT call this on the hot path.
"""
while not self._buffer.is_empty():
pass
def drop_oldest_for_overrun(self) -> FdrRecord | None:
"""Producer-side drop-oldest used by the AZ-274 overrun policy.
Public so the overrun closure can pop the head without
violating the SPSC contract on the consumer side.
"""
return self._buffer.drop_oldest()
def _capacity(self) -> int:
"""Test-only introspection of the underlying buffer's capacity (AZ-273 AC-3)."""
return self._buffer.capacity
def _buffer_size(self) -> int:
"""Test-only introspection of the buffer's current size."""
return self._buffer.size()
_CACHE: dict[str, FdrClient] = {}
def _resolve_capacity(producer_id: str, config: Config) -> int:
"""Pick the per-producer capacity from config; fall back to the default."""
per_producer = getattr(config.fdr, "per_producer_capacity", {}) or {}
override = per_producer.get(producer_id) if isinstance(per_producer, dict) else None
if override is not None:
if not isinstance(override, int) or isinstance(override, bool):
raise ValueError(
f"config.fdr.per_producer_capacity[{producer_id!r}] must be a "
f"non-bool int; got {override!r}"
)
if override <= 0:
raise ValueError(
f"config.fdr.per_producer_capacity[{producer_id!r}] must be > 0; got {override}"
)
return override
return config.fdr.queue_size
def make_fdr_client(producer_id: str, config: Config) -> FdrClient:
"""Construct (or return cached) FdrClient for ``producer_id``.
Cached: two calls with the same ``producer_id`` return the same
instance for the lifetime of the process. ``_reset_for_tests()``
clears the cache.
"""
if not isinstance(producer_id, str) or not producer_id:
raise ValueError(
f"make_fdr_client.producer_id must be a non-empty string; got {producer_id!r}"
)
cached = _CACHE.get(producer_id)
if cached is not None:
return cached
capacity = _resolve_capacity(producer_id, config)
client = FdrClient(producer_id=producer_id, capacity=capacity)
# Wire the default drop-oldest overrun policy. Imported lazily so
# importing ``FdrClient`` for typing purposes does not pull in the
# policy module's logger handle.
from gps_denied_onboard.fdr_client.overrun_policy import default_overrun_policy
client.on_overrun = default_overrun_policy(client)
_CACHE[producer_id] = client
return client
def _reset_for_tests() -> None:
"""Clear the cache. Documented test-only entry per contract Non-Goals."""
_CACHE.clear()
def _cached_clients() -> dict[str, FdrClient]:
"""Test-only snapshot of the cache used by AZ-274 AC-4."""
return dict(_CACHE)