[AZ-273] [AZ-274] [AZ-275] [AZ-267] [AZ-268] FDR producer chain + log bridge + contract test

AZ-273: lock-free SPSC ring buffer with pre-allocated slots, power-of-
two capacity, opt-in SPSC guard, and EnqueueResult / FdrSpscViolationError
on the public surface. make_fdr_client caches one client per producer_id
and reads capacity from config.fdr.per_producer_capacity with fallback
to queue_size.
AZ-274: default_overrun_policy implements drop-oldest + retry + immediate
marker emission, with prior-marker dropped_count folding via _evict_one
so user-loss info is never lost across iterations. ERROR diagnostic is
rate-limited to <=1/sec per producer.
AZ-275: FakeFdrSink mirrors the FdrClient public surface and reuses the
production default_overrun_policy via a duck-typed _PolicyAdapter. The
test-only records/all_records_ever properties let component tests assert
both in-buffer and lifetime state. tests/conftest.py registers the
fake_fdr_sink fixture and an AST architecture lint forbids production
imports of fakes.
AZ-267: FdrLogBridgeHandler installs on the root logger via wire_log_bridge
and forwards only WARN+ERROR records into the FDR with kind="log".
Thread-local recursion guard short-circuits internal logging; saturated-
queue diagnostics go to stderr every N=1000 drops.
AZ-268: tests/contract/log_schema.py covers every row of the schema's
Test Cases table plus the "DEBUG+INFO never reach FDR" invariant.
pyproject.toml registers the contract pytest marker and the
contract-mandated log_schema.py file-name.
251 unit + contract tests pass (48 new). Review verdict:
PASS_WITH_WARNINGS; findings are NFR-perf deferrals + documented
relaxation of AZ-274 AC-2 coalescing under permanently-stalled consumer.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-11 03:00:49 +03:00
parent 3acc7f33dd
commit ba20c2d195
24 changed files with 2714 additions and 20 deletions
+9 -2
View File
@@ -1,8 +1,15 @@
"""Structured JSON logging entrypoint (E-CC-LOG / AZ-245 / AZ-266).
Public surface — every component imports `get_logger` from here. The
handler topology is selected by `configure_logging(tier=...)` at the
Public surface — every component imports ``get_logger`` from here. The
handler topology is selected by ``configure_logging(tier=...)`` at the
composition-root entrypoint.
The FDR log bridge (AZ-267) is wiring code used only by the composition
root. It is intentionally NOT re-exported here: importing it would
trigger a circular import (the bridge depends on ``fdr_client.client``,
which itself logs via ``get_logger``). Composition-root code imports
``from gps_denied_onboard.logging.fdr_bridge import wire_log_bridge``
explicitly.
"""
from gps_denied_onboard.logging.structured import (
@@ -0,0 +1,263 @@
"""FDR log bridge — forwards WARN + ERROR records into the FDR (AZ-267).
A :class:`logging.Handler` subclass installed on the root onboard logger
(or each named logger). For every emitted record at level WARN or
ERROR, it builds a ``kind="log"`` :class:`FdrRecord` carrying the
schema-mandated 7 fields and enqueues into a producer-side
:class:`FdrClient`. INFO and DEBUG records are dropped at the handler's
level filter — they never reach the FDR (AC-3).
Public surface frozen by
``_docs/02_document/contracts/shared_logging/log_record_schema.md``
v1.0.0 (the wire shape of the resulting record's payload) and
``_docs/02_document/contracts/shared_fdr_client/fdr_client_protocol.md``
v1.0.0 (the queue we enqueue into).
Concurrency
-----------
* The handler may run on any logger thread.
* A thread-local recursion guard short-circuits any logging call that
originates from inside the handler itself — without this guard, a
failure path that emits a diagnostic WARN would recurse through the
same handler indefinitely.
* Saturated-queue diagnostics throttle to 1-per-``_DROP_LOG_EVERY_N``
occurrences via stdout (not via the bridge) to avoid the same loop.
"""
from __future__ import annotations
import datetime as _dt
import logging
import sys
import threading
from collections.abc import Callable
from typing import Any, Final
from gps_denied_onboard.fdr_client.client import EnqueueResult
from gps_denied_onboard.fdr_client.records import (
CURRENT_SCHEMA_VERSION,
FdrRecord,
)
# The resolver's return value must expose ``enqueue(record) -> str``; both
# the production ``FdrClient`` and the test ``FakeFdrSink`` satisfy that.
# Typed as ``Any`` here to avoid pulling FakeFdrSink (a tests-only fake)
# into the production module's type contract.
_FdrLikeResolver = Callable[[str], Any]
__all__ = [
"FdrLogBridgeHandler",
"wire_log_bridge",
]
_BRIDGE_HANDLER_MARKER_ATTR: Final[str] = "_gps_denied_fdr_bridge_handler"
# Throttle the stdout "queue saturated" diagnostic to one in every N
# occurrences per NFR-reliability ("at least N>=1000").
_DROP_LOG_EVERY_N: Final[int] = 1000
# Level-name normalisation matching the schema contract (``WARNING`` ->
# ``WARN``). Mirrors the logic in ``logging.structured._normalise_level``;
# duplicated locally to keep this module self-contained and avoid a
# circular import.
def _normalise_level(stdlib_levelname: str) -> str:
if stdlib_levelname == "WARNING":
return "WARN"
return stdlib_levelname
def _iso_utc(created_epoch: float) -> str:
"""RFC 3339 UTC with microsecond precision + ``Z`` suffix."""
dt = _dt.datetime.fromtimestamp(created_epoch, tz=_dt.timezone.utc)
return dt.strftime("%Y-%m-%dT%H:%M:%S.") + f"{dt.microsecond:06d}Z"
# Reserved stdlib LogRecord attributes that must not leak into the FDR
# record's payload.kv. Mirrors the same list in
# ``logging.structured._RESERVED_LOG_RECORD_KEYS``.
_RESERVED_LOG_RECORD_KEYS: Final[frozenset[str]] = frozenset(
{
"args",
"asctime",
"created",
"exc_info",
"exc_text",
"filename",
"funcName",
"levelname",
"levelno",
"lineno",
"message",
"module",
"msecs",
"msg",
"name",
"pathname",
"process",
"processName",
"relativeCreated",
"stack_info",
"taskName",
"thread",
"threadName",
"frame_id",
"kind",
"kv",
"component",
}
)
def _coerce_jsonable(value: Any) -> Any:
"""Mirror of the formatter's coercer; raises on non-JSON-safe types."""
if isinstance(value, (str, int, float, bool)) or value is None:
return value
if isinstance(value, dict):
return {str(k): _coerce_jsonable(v) for k, v in value.items()}
if isinstance(value, (list, tuple)):
return [_coerce_jsonable(v) for v in value]
raise TypeError(f"unserialisable kv payload type: {type(value).__name__}")
class FdrLogBridgeHandler(logging.Handler):
"""Forwards WARN + ERROR :class:`logging.LogRecord` into the FDR.
Constructed with a callable that resolves to the per-record
:class:`FdrClient`. The callable lets the composition root inject
either a per-component sink (one FdrClient per component) or a
shared "log fan-out" client; either way the bridge stays decoupled
from the registry.
"""
_recursion_local = threading.local()
def __init__(
self,
fdr_client_resolver: _FdrLikeResolver,
*,
level: int = logging.WARNING,
) -> None:
super().__init__(level=level)
if not callable(fdr_client_resolver):
raise TypeError(f"fdr_client_resolver must be callable; got {fdr_client_resolver!r}")
self._resolver: _FdrLikeResolver = fdr_client_resolver
self._drop_counter: int = 0
self._drop_lock: threading.Lock = threading.Lock()
setattr(self, _BRIDGE_HANDLER_MARKER_ATTR, True)
def emit(self, record: logging.LogRecord) -> None:
if getattr(self._recursion_local, "in_bridge", False):
# Short-circuit: a log call originating from inside the
# bridge (saturated-queue diagnostic, etc.) must NOT loop.
return
self._recursion_local.in_bridge = True
try:
self._emit_unguarded(record)
finally:
self._recursion_local.in_bridge = False
def _emit_unguarded(self, record: logging.LogRecord) -> None:
try:
fdr_record = self._build_fdr_record(record)
except Exception as exc:
# Translation failed (e.g. non-serialisable kv that even the
# formatter's fallback didn't catch). Skip this record;
# never raise into the caller.
self._note_drop(reason=f"translate_error: {exc}")
return
component = fdr_record.payload.get("component") or record.name
try:
client = self._resolver(component if isinstance(component, str) else record.name)
except Exception as exc:
self._note_drop(reason=f"resolve_error: {exc}")
return
result = client.enqueue(fdr_record)
if result == EnqueueResult.OVERRUN:
self._note_drop(reason="queue_saturated")
def _build_fdr_record(self, record: logging.LogRecord) -> FdrRecord:
rec_dict = record.__dict__
component = rec_dict.get("component") or record.name
kind = rec_dict.get("kind") or "log.diag"
frame_id = rec_dict.get("frame_id")
explicit_kv = rec_dict.get("kv")
if explicit_kv is None:
kv_raw: dict[str, Any] = {
k: v
for k, v in rec_dict.items()
if k not in _RESERVED_LOG_RECORD_KEYS and not k.startswith("_")
}
else:
kv_raw = dict(explicit_kv)
try:
kv_safe = _coerce_jsonable(kv_raw)
except (TypeError, ValueError) as exc:
kv_safe = {"_format_error": f"{type(exc).__name__}: {exc}"}
exc_text: str | None = None
if record.exc_info:
exc_text = logging.Formatter().formatException(record.exc_info)
return FdrRecord(
schema_version=CURRENT_SCHEMA_VERSION,
ts=_iso_utc(record.created),
producer_id=component,
kind="log",
payload={
"level": _normalise_level(record.levelname),
"component": component,
"frame_id": frame_id,
"kind": kind,
"msg": record.getMessage().replace("\n", " "),
"kv": kv_safe,
"exc": exc_text,
},
)
def _note_drop(self, *, reason: str) -> None:
"""Throttled stdout diagnostic; intentionally bypasses the logger."""
with self._drop_lock:
self._drop_counter += 1
if self._drop_counter % _DROP_LOG_EVERY_N != 1:
return
current = self._drop_counter
# stderr (not stdout) so log capture in tests doesn't confuse
# the bridge's diagnostic with normal application output.
print(
f"FdrLogBridgeHandler: dropped record #{current} (reason={reason})",
file=sys.stderr,
)
def wire_log_bridge(
fdr_client_resolver: _FdrLikeResolver,
*,
target_loggers: tuple[str, ...] = ("",),
level: int = logging.WARNING,
) -> FdrLogBridgeHandler:
"""Install a single :class:`FdrLogBridgeHandler` on ``target_loggers``.
Idempotent: re-calling replaces any prior bridge handler on the
same logger(s) — exactly one bridge per logger (AC-5).
Returns the installed handler so the composition root can keep a
handle for teardown (e.g. test isolation).
"""
handler = FdrLogBridgeHandler(fdr_client_resolver, level=level)
for name in target_loggers:
target = logging.getLogger(name)
target.handlers = [
h for h in target.handlers if not getattr(h, _BRIDGE_HANDLER_MARKER_ATTR, False)
]
target.addHandler(handler)
# Lower the logger's effective level if it is currently above WARN.
if target.level == logging.NOTSET or target.level > level:
target.setLevel(level)
return handler