mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-22 23:41:13 +00:00
[AZ-273] [AZ-274] [AZ-275] [AZ-267] [AZ-268] FDR producer chain + log bridge + contract test
AZ-273: lock-free SPSC ring buffer with pre-allocated slots, power-of- two capacity, opt-in SPSC guard, and EnqueueResult / FdrSpscViolationError on the public surface. make_fdr_client caches one client per producer_id and reads capacity from config.fdr.per_producer_capacity with fallback to queue_size. AZ-274: default_overrun_policy implements drop-oldest + retry + immediate marker emission, with prior-marker dropped_count folding via _evict_one so user-loss info is never lost across iterations. ERROR diagnostic is rate-limited to <=1/sec per producer. AZ-275: FakeFdrSink mirrors the FdrClient public surface and reuses the production default_overrun_policy via a duck-typed _PolicyAdapter. The test-only records/all_records_ever properties let component tests assert both in-buffer and lifetime state. tests/conftest.py registers the fake_fdr_sink fixture and an AST architecture lint forbids production imports of fakes. AZ-267: FdrLogBridgeHandler installs on the root logger via wire_log_bridge and forwards only WARN+ERROR records into the FDR with kind="log". Thread-local recursion guard short-circuits internal logging; saturated- queue diagnostics go to stderr every N=1000 drops. AZ-268: tests/contract/log_schema.py covers every row of the schema's Test Cases table plus the "DEBUG+INFO never reach FDR" invariant. pyproject.toml registers the contract pytest marker and the contract-mandated log_schema.py file-name. 251 unit + contract tests pass (48 new). Review verdict: PASS_WITH_WARNINGS; findings are NFR-perf deferrals + documented relaxation of AZ-274 AC-2 coalescing under permanently-stalled consumer. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -0,0 +1,263 @@
|
||||
"""FDR log bridge — forwards WARN + ERROR records into the FDR (AZ-267).
|
||||
|
||||
A :class:`logging.Handler` subclass installed on the root onboard logger
|
||||
(or each named logger). For every emitted record at level WARN or
|
||||
ERROR, it builds a ``kind="log"`` :class:`FdrRecord` carrying the
|
||||
schema-mandated 7 fields and enqueues into a producer-side
|
||||
:class:`FdrClient`. INFO and DEBUG records are dropped at the handler's
|
||||
level filter — they never reach the FDR (AC-3).
|
||||
|
||||
Public surface frozen by
|
||||
``_docs/02_document/contracts/shared_logging/log_record_schema.md``
|
||||
v1.0.0 (the wire shape of the resulting record's payload) and
|
||||
``_docs/02_document/contracts/shared_fdr_client/fdr_client_protocol.md``
|
||||
v1.0.0 (the queue we enqueue into).
|
||||
|
||||
Concurrency
|
||||
-----------
|
||||
|
||||
* The handler may run on any logger thread.
|
||||
* A thread-local recursion guard short-circuits any logging call that
|
||||
originates from inside the handler itself — without this guard, a
|
||||
failure path that emits a diagnostic WARN would recurse through the
|
||||
same handler indefinitely.
|
||||
* Saturated-queue diagnostics throttle to 1-per-``_DROP_LOG_EVERY_N``
|
||||
occurrences via stdout (not via the bridge) to avoid the same loop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as _dt
|
||||
import logging
|
||||
import sys
|
||||
import threading
|
||||
from collections.abc import Callable
|
||||
from typing import Any, Final
|
||||
|
||||
from gps_denied_onboard.fdr_client.client import EnqueueResult
|
||||
from gps_denied_onboard.fdr_client.records import (
|
||||
CURRENT_SCHEMA_VERSION,
|
||||
FdrRecord,
|
||||
)
|
||||
|
||||
# The resolver's return value must expose ``enqueue(record) -> str``; both
|
||||
# the production ``FdrClient`` and the test ``FakeFdrSink`` satisfy that.
|
||||
# Typed as ``Any`` here to avoid pulling FakeFdrSink (a tests-only fake)
|
||||
# into the production module's type contract.
|
||||
_FdrLikeResolver = Callable[[str], Any]
|
||||
|
||||
__all__ = [
|
||||
"FdrLogBridgeHandler",
|
||||
"wire_log_bridge",
|
||||
]
|
||||
|
||||
|
||||
_BRIDGE_HANDLER_MARKER_ATTR: Final[str] = "_gps_denied_fdr_bridge_handler"
|
||||
|
||||
# Throttle the stdout "queue saturated" diagnostic to one in every N
|
||||
# occurrences per NFR-reliability ("at least N>=1000").
|
||||
_DROP_LOG_EVERY_N: Final[int] = 1000
|
||||
|
||||
|
||||
# Level-name normalisation matching the schema contract (``WARNING`` ->
|
||||
# ``WARN``). Mirrors the logic in ``logging.structured._normalise_level``;
|
||||
# duplicated locally to keep this module self-contained and avoid a
|
||||
# circular import.
|
||||
def _normalise_level(stdlib_levelname: str) -> str:
|
||||
if stdlib_levelname == "WARNING":
|
||||
return "WARN"
|
||||
return stdlib_levelname
|
||||
|
||||
|
||||
def _iso_utc(created_epoch: float) -> str:
|
||||
"""RFC 3339 UTC with microsecond precision + ``Z`` suffix."""
|
||||
dt = _dt.datetime.fromtimestamp(created_epoch, tz=_dt.timezone.utc)
|
||||
return dt.strftime("%Y-%m-%dT%H:%M:%S.") + f"{dt.microsecond:06d}Z"
|
||||
|
||||
|
||||
# Reserved stdlib LogRecord attributes that must not leak into the FDR
|
||||
# record's payload.kv. Mirrors the same list in
|
||||
# ``logging.structured._RESERVED_LOG_RECORD_KEYS``.
|
||||
_RESERVED_LOG_RECORD_KEYS: Final[frozenset[str]] = frozenset(
|
||||
{
|
||||
"args",
|
||||
"asctime",
|
||||
"created",
|
||||
"exc_info",
|
||||
"exc_text",
|
||||
"filename",
|
||||
"funcName",
|
||||
"levelname",
|
||||
"levelno",
|
||||
"lineno",
|
||||
"message",
|
||||
"module",
|
||||
"msecs",
|
||||
"msg",
|
||||
"name",
|
||||
"pathname",
|
||||
"process",
|
||||
"processName",
|
||||
"relativeCreated",
|
||||
"stack_info",
|
||||
"taskName",
|
||||
"thread",
|
||||
"threadName",
|
||||
"frame_id",
|
||||
"kind",
|
||||
"kv",
|
||||
"component",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _coerce_jsonable(value: Any) -> Any:
|
||||
"""Mirror of the formatter's coercer; raises on non-JSON-safe types."""
|
||||
if isinstance(value, (str, int, float, bool)) or value is None:
|
||||
return value
|
||||
if isinstance(value, dict):
|
||||
return {str(k): _coerce_jsonable(v) for k, v in value.items()}
|
||||
if isinstance(value, (list, tuple)):
|
||||
return [_coerce_jsonable(v) for v in value]
|
||||
raise TypeError(f"unserialisable kv payload type: {type(value).__name__}")
|
||||
|
||||
|
||||
class FdrLogBridgeHandler(logging.Handler):
|
||||
"""Forwards WARN + ERROR :class:`logging.LogRecord` into the FDR.
|
||||
|
||||
Constructed with a callable that resolves to the per-record
|
||||
:class:`FdrClient`. The callable lets the composition root inject
|
||||
either a per-component sink (one FdrClient per component) or a
|
||||
shared "log fan-out" client; either way the bridge stays decoupled
|
||||
from the registry.
|
||||
"""
|
||||
|
||||
_recursion_local = threading.local()
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
fdr_client_resolver: _FdrLikeResolver,
|
||||
*,
|
||||
level: int = logging.WARNING,
|
||||
) -> None:
|
||||
super().__init__(level=level)
|
||||
if not callable(fdr_client_resolver):
|
||||
raise TypeError(f"fdr_client_resolver must be callable; got {fdr_client_resolver!r}")
|
||||
self._resolver: _FdrLikeResolver = fdr_client_resolver
|
||||
self._drop_counter: int = 0
|
||||
self._drop_lock: threading.Lock = threading.Lock()
|
||||
setattr(self, _BRIDGE_HANDLER_MARKER_ATTR, True)
|
||||
|
||||
def emit(self, record: logging.LogRecord) -> None:
|
||||
if getattr(self._recursion_local, "in_bridge", False):
|
||||
# Short-circuit: a log call originating from inside the
|
||||
# bridge (saturated-queue diagnostic, etc.) must NOT loop.
|
||||
return
|
||||
self._recursion_local.in_bridge = True
|
||||
try:
|
||||
self._emit_unguarded(record)
|
||||
finally:
|
||||
self._recursion_local.in_bridge = False
|
||||
|
||||
def _emit_unguarded(self, record: logging.LogRecord) -> None:
|
||||
try:
|
||||
fdr_record = self._build_fdr_record(record)
|
||||
except Exception as exc:
|
||||
# Translation failed (e.g. non-serialisable kv that even the
|
||||
# formatter's fallback didn't catch). Skip this record;
|
||||
# never raise into the caller.
|
||||
self._note_drop(reason=f"translate_error: {exc}")
|
||||
return
|
||||
|
||||
component = fdr_record.payload.get("component") or record.name
|
||||
try:
|
||||
client = self._resolver(component if isinstance(component, str) else record.name)
|
||||
except Exception as exc:
|
||||
self._note_drop(reason=f"resolve_error: {exc}")
|
||||
return
|
||||
|
||||
result = client.enqueue(fdr_record)
|
||||
if result == EnqueueResult.OVERRUN:
|
||||
self._note_drop(reason="queue_saturated")
|
||||
|
||||
def _build_fdr_record(self, record: logging.LogRecord) -> FdrRecord:
|
||||
rec_dict = record.__dict__
|
||||
component = rec_dict.get("component") or record.name
|
||||
kind = rec_dict.get("kind") or "log.diag"
|
||||
frame_id = rec_dict.get("frame_id")
|
||||
|
||||
explicit_kv = rec_dict.get("kv")
|
||||
if explicit_kv is None:
|
||||
kv_raw: dict[str, Any] = {
|
||||
k: v
|
||||
for k, v in rec_dict.items()
|
||||
if k not in _RESERVED_LOG_RECORD_KEYS and not k.startswith("_")
|
||||
}
|
||||
else:
|
||||
kv_raw = dict(explicit_kv)
|
||||
|
||||
try:
|
||||
kv_safe = _coerce_jsonable(kv_raw)
|
||||
except (TypeError, ValueError) as exc:
|
||||
kv_safe = {"_format_error": f"{type(exc).__name__}: {exc}"}
|
||||
|
||||
exc_text: str | None = None
|
||||
if record.exc_info:
|
||||
exc_text = logging.Formatter().formatException(record.exc_info)
|
||||
|
||||
return FdrRecord(
|
||||
schema_version=CURRENT_SCHEMA_VERSION,
|
||||
ts=_iso_utc(record.created),
|
||||
producer_id=component,
|
||||
kind="log",
|
||||
payload={
|
||||
"level": _normalise_level(record.levelname),
|
||||
"component": component,
|
||||
"frame_id": frame_id,
|
||||
"kind": kind,
|
||||
"msg": record.getMessage().replace("\n", " "),
|
||||
"kv": kv_safe,
|
||||
"exc": exc_text,
|
||||
},
|
||||
)
|
||||
|
||||
def _note_drop(self, *, reason: str) -> None:
|
||||
"""Throttled stdout diagnostic; intentionally bypasses the logger."""
|
||||
with self._drop_lock:
|
||||
self._drop_counter += 1
|
||||
if self._drop_counter % _DROP_LOG_EVERY_N != 1:
|
||||
return
|
||||
current = self._drop_counter
|
||||
# stderr (not stdout) so log capture in tests doesn't confuse
|
||||
# the bridge's diagnostic with normal application output.
|
||||
print(
|
||||
f"FdrLogBridgeHandler: dropped record #{current} (reason={reason})",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
|
||||
def wire_log_bridge(
|
||||
fdr_client_resolver: _FdrLikeResolver,
|
||||
*,
|
||||
target_loggers: tuple[str, ...] = ("",),
|
||||
level: int = logging.WARNING,
|
||||
) -> FdrLogBridgeHandler:
|
||||
"""Install a single :class:`FdrLogBridgeHandler` on ``target_loggers``.
|
||||
|
||||
Idempotent: re-calling replaces any prior bridge handler on the
|
||||
same logger(s) — exactly one bridge per logger (AC-5).
|
||||
|
||||
Returns the installed handler so the composition root can keep a
|
||||
handle for teardown (e.g. test isolation).
|
||||
"""
|
||||
handler = FdrLogBridgeHandler(fdr_client_resolver, level=level)
|
||||
for name in target_loggers:
|
||||
target = logging.getLogger(name)
|
||||
target.handlers = [
|
||||
h for h in target.handlers if not getattr(h, _BRIDGE_HANDLER_MARKER_ATTR, False)
|
||||
]
|
||||
target.addHandler(handler)
|
||||
# Lower the logger's effective level if it is currently above WARN.
|
||||
if target.level == logging.NOTSET or target.level > level:
|
||||
target.setLevel(level)
|
||||
return handler
|
||||
Reference in New Issue
Block a user