[AZ-343] C2.5 InlierCountReRanker + shared FeatureExtractor helper

Implements the production-default ReRankStrategy: K=10 → N=3 by
single-pair LightGlue inlier count, with strict drop-and-continue
(INV-8) on per-candidate TileFetch / backbone / zero-inlier failures
and RerankAllCandidatesFailedError on zero survivors. Composition
root injects the shared LightGlueRuntime + Clock + the new
FeatureExtractor helper (an L1 placeholder OpenCvOrbExtractor that
unblocks AZ-343 and future C3 strategies — task scope expansion).

Architectural notes:
- Cross-component imports stay banned; tile_store types as `object`
  and the C6 TileCacheError family is duck-typed by class module
  prefix (same workaround AZ-348 adopted for c7_inference; proper
  fix is to relocate TileCacheError to _types/ in a follow-up).
- Clock injection follows the replay contract (AZ-398 Invariant 2);
  reranked_at is sourced from clock.monotonic_ns().
- AZ-342 factory grew `feature_extractor` + `clock` + `fdr_client`
  parameters; existing AZ-342 conformance tests updated.

Tests: 19 new AC-1..AC-12 + mixed-failure scenarios in
test_inlier_count_reranker.py; existing AZ-342 suite (26) still
green. Full repo sweep 1093 passed / 2 skipped (cmake/actionlint
not on PATH).

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-12 06:22:40 +03:00
parent 9a605c8514
commit 48ea1e2fc2
10 changed files with 1739 additions and 13 deletions
@@ -37,10 +37,18 @@ class C2_5RerankConfig:
``top_n`` is the per-frame N cap (1..K-1). Default 3 (the epic's
K=10 → N=3 spec).
``debug_per_frame_log`` gates the two DEBUG events
(``c2_5.rerank.zero_inliers`` per dropped candidate and
``c2_5.rerank.frame_done`` per frame); flooding journald at
``3 Hz × K=10 = 30 events/sec`` by default would violate
description.md § 9. Operators flip this to ``True`` for the
debug-build flight binary.
"""
strategy: str = "inlier_count"
top_n: int = 3
debug_per_frame_log: bool = False
def __post_init__(self) -> None:
if self.strategy not in KNOWN_STRATEGIES:
@@ -0,0 +1,584 @@
"""C2.5 :class:`InlierCountReRanker` — single-pair LightGlue inlier count (AZ-343).
Production-default :class:`ReRankStrategy` for the K=10 → N=3 cut.
For each candidate in :class:`VprResult.candidates`:
1. Fetch tile pixels via :class:`TileStore.read_tile_pixels` (a
:class:`TilePixelHandle` context manager backed by an mmap'd JPEG).
2. Extract :class:`KeypointSet` from BOTH the query frame and the
candidate tile via the shared :class:`FeatureExtractor` (AZ-343
scope expansion).
3. Call :meth:`LightGlueRuntime.match` for the single pair; count the
number of correspondences as the inlier proxy.
4. Sort surviving candidates descending by ``inlier_count`` (ties
broken ascending by ``descriptor_distance`` carried forward from
C2; INV-3); truncate to ``n``; return a :class:`RerankResult`.
Drop-and-continue (INV-8) is the central reliability mechanism: any
per-candidate :class:`TileCacheError` or LightGlue / feature-extractor
failure is caught inside the loop, the candidate is dropped, an ERROR
log + FDR record is emitted, and the loop continues. Only the
zero-survivors case escapes as :class:`RerankAllCandidatesFailedError`.
The survivor's ``tile_pixels_handle`` is identity-equal to the handle
returned by ``TileStore.read_tile_pixels`` (INV-6 / AC-7). The handle
is exited at the end of feature extraction; downstream C3 re-enters it
to read pixels — the C6 page-cache-backed impl supports re-entry for
the per-frame TTL window.
"""
from __future__ import annotations
import logging
from datetime import datetime, timezone
from typing import TYPE_CHECKING
import cv2
import numpy as np
from gps_denied_onboard._types.rerank import RerankCandidate, RerankResult
from gps_denied_onboard.components.c2_5_rerank.errors import (
RerankAllCandidatesFailedError,
RerankBackboneError,
)
from gps_denied_onboard.fdr_client import FdrRecord
from gps_denied_onboard.helpers.feature_extractor import FeatureExtractorError
from gps_denied_onboard.helpers.lightglue_runtime import (
LightGlueConcurrentAccessError,
LightGlueRuntimeError,
)
if TYPE_CHECKING:
from gps_denied_onboard._types.calibration import CameraCalibration
from gps_denied_onboard._types.matching import KeypointSet
from gps_denied_onboard._types.nav import NavCameraFrame
from gps_denied_onboard._types.vpr import VprResult
from gps_denied_onboard.clock import Clock
from gps_denied_onboard.config.schema import Config
from gps_denied_onboard.fdr_client import FdrClient
from gps_denied_onboard.helpers.feature_extractor import FeatureExtractor
from gps_denied_onboard.helpers.lightglue_runtime import LightGlueRuntime
# Cross-component types (`TileStore`, `ReRankStrategy`,
# `C2_5RerankConfig`) are intentionally NOT imported here — even under
# ``TYPE_CHECKING``, an AST-level cross-component import is rejected by
# ``test_ac6_only_compose_root_imports_concrete_strategies``. The
# composition root (``runtime_root.rerank_factory``) injects concrete
# instances satisfying these Protocols; we accept them as ``object``
# at the constructor boundary and trust the runtime root for type
# safety.
__all__ = ["InlierCountReRanker", "create"]
_LOG = logging.getLogger("gps_denied_onboard.c2_5_rerank")
_PRODUCER_ID = "c2_5_rerank.inlier_count"
# C6 TileCacheError lives in `gps_denied_onboard.components.c6_tile_cache.errors`
# but we cannot import it: cross-component imports are banned outside the
# composition root (test_ac6_only_compose_root_imports_concrete_strategies).
# Match the family by class-module prefix instead — the C6 contract documents
# the module path so a future module-rename surfaces as a test failure here.
_C6_ERROR_MODULE_PREFIX = "gps_denied_onboard.components.c6_tile_cache"
def _is_tile_cache_error(exc: BaseException) -> bool:
"""True if ``exc`` is a C6 :class:`TileCacheError` subclass.
Duck-types against the producer's class module to keep the
architectural import boundary clean. Programming errors raised
from the C6 module (e.g. ``AttributeError``) would also match —
that is acceptable, since by Contract C6 wraps OS errors into
:class:`TileCacheError`; anything bare leaking out is a C6 bug
that the per-candidate drop semantics will absorb just as the
contract expects of any per-candidate failure.
"""
return type(exc).__module__.startswith(_C6_ERROR_MODULE_PREFIX)
class InlierCountReRanker:
"""Single-pair LightGlue inlier-count :class:`ReRankStrategy` (AZ-343)."""
def __init__(
self,
*,
config: Config,
tile_store: object,
lightglue_runtime: LightGlueRuntime,
feature_extractor: FeatureExtractor,
clock: Clock,
fdr_client: FdrClient | None,
) -> None:
# Keyword-only injection: a runtime-root regression that forgets
# one of the helpers fails loudly instead of silently constructing
# an under-wired strategy. ``tile_store`` is typed ``object``
# because the C6 ``TileStore`` Protocol lives in another
# component (see the module docstring on cross-component imports).
block = config.components["c2_5_rerank"]
self._tile_store = tile_store
self._lightglue_runtime = lightglue_runtime
self._feature_extractor = feature_extractor
self._clock = clock
self._fdr_client = fdr_client
self._top_n = int(block.top_n)
self._debug_per_frame_log = bool(block.debug_per_frame_log)
def rerank(
self,
frame: NavCameraFrame,
vpr_result: VprResult,
n: int,
calibration: CameraCalibration,
) -> RerankResult:
candidates_input = len(vpr_result.candidates)
target_n = self._top_n if n <= 0 else min(self._top_n, n)
if candidates_input == 0:
self._fail_all(
frame_id=vpr_result.frame_id,
candidates_input=0,
candidates_dropped=0,
reason="no_input_candidates",
)
query_features = self._extract_query_features(frame)
if query_features is None:
self._fail_all(
frame_id=vpr_result.frame_id,
candidates_input=candidates_input,
candidates_dropped=candidates_input,
reason="query_extraction_failed",
)
survivors: list[RerankCandidate] = []
dropped = 0
inlier_counts: list[int] = []
for vpr_candidate in vpr_result.candidates:
tile_id = vpr_candidate.tile_id
survivor = self._process_candidate(
tile_id=tile_id,
vpr_candidate=vpr_candidate,
query_features=query_features,
frame_id=vpr_result.frame_id,
)
if survivor is None:
dropped += 1
continue
survivors.append(survivor)
inlier_counts.append(survivor.inlier_count)
if not survivors:
self._fail_all(
frame_id=vpr_result.frame_id,
candidates_input=candidates_input,
candidates_dropped=dropped,
reason="all_candidates_dropped",
)
survivors.sort(key=lambda c: (-c.inlier_count, c.descriptor_distance))
truncated = tuple(survivors[:target_n])
if len(truncated) < target_n:
_LOG.warning(
"c2_5.rerank.fewer_than_n_survivors",
extra={
"kind": "c2_5.rerank.fewer_than_n_survivors",
"kv": {
"requested": target_n,
"returned": len(truncated),
"dropped": dropped,
"frame_id": vpr_result.frame_id,
},
},
)
if self._debug_per_frame_log:
_LOG.debug(
"c2_5.rerank.frame_done",
extra={
"kind": "c2_5.rerank.frame_done",
"kv": {
"frame_id": vpr_result.frame_id,
"inlier_counts": inlier_counts,
},
},
)
result = RerankResult(
frame_id=vpr_result.frame_id,
candidates=truncated,
reranked_at=int(self._clock.monotonic_ns()),
rerank_label="inlier_count",
candidates_input=candidates_input,
candidates_dropped=dropped,
)
self._emit_frame_done_fdr(result)
return result
# ------------------------------------------------------------------
# Per-candidate pipeline: open handle → extract → match → score.
def _process_candidate(
self,
*,
tile_id,
vpr_candidate,
query_features,
frame_id: int,
) -> RerankCandidate | None:
try:
handle = self._tile_store.read_tile_pixels(tile_id)
except Exception as exc:
if not _is_tile_cache_error(exc):
raise
self._log_tile_fetch_error(tile_id=tile_id, frame_id=frame_id, exc=exc)
return None
tile_features = self._extract_tile_features(
handle=handle, tile_id=tile_id, frame_id=frame_id
)
if tile_features is None:
return None
inlier_count = self._count_inliers(
query_features=query_features,
tile_features=tile_features,
tile_id=tile_id,
frame_id=frame_id,
)
if inlier_count is None:
return None
if inlier_count == 0:
self._maybe_log_zero_inliers(tile_id=tile_id, frame_id=frame_id)
return None
return RerankCandidate(
tile_id=tile_id,
inlier_count=inlier_count,
descriptor_distance=vpr_candidate.descriptor_distance,
descriptor_dim=vpr_candidate.descriptor_dim,
tile_pixels_handle=handle,
)
def _extract_query_features(
self, frame: NavCameraFrame
) -> KeypointSet | None:
image = _ensure_bgr_array(frame.image)
if image is None:
self._log_backbone_error(
frame_id=frame.frame_id,
tile_id=None,
reason="query_image_not_decodable",
error=None,
)
return None
try:
return self._feature_extractor.extract(image)
except FeatureExtractorError as exc:
self._log_backbone_error(
frame_id=frame.frame_id,
tile_id=None,
reason="query_feature_extraction_failed",
error=exc,
)
return None
def _extract_tile_features(
self, *, handle, tile_id, frame_id: int
) -> KeypointSet | None:
try:
with handle as jpeg_view:
tile_image = _decode_jpeg(jpeg_view)
except ValueError as exc:
self._log_tile_fetch_error(tile_id=tile_id, frame_id=frame_id, exc=exc)
return None
except Exception as exc:
if not _is_tile_cache_error(exc):
raise
self._log_tile_fetch_error(tile_id=tile_id, frame_id=frame_id, exc=exc)
return None
try:
return self._feature_extractor.extract(tile_image)
except FeatureExtractorError as exc:
self._log_backbone_error(
frame_id=frame_id,
tile_id=tile_id,
reason="tile_feature_extraction_failed",
error=exc,
)
return None
def _count_inliers(
self,
*,
query_features,
tile_features,
tile_id,
frame_id: int,
) -> int | None:
try:
correspondences = self._lightglue_runtime.match(
query_features, tile_features
)
except (
LightGlueRuntimeError,
LightGlueConcurrentAccessError,
RerankBackboneError,
RuntimeError,
) as exc:
self._log_backbone_error(
frame_id=frame_id,
tile_id=tile_id,
reason="lightglue_forward_failed",
error=exc,
)
return None
scores = getattr(correspondences, "scores", None)
if scores is None:
return 0
try:
return int(np.asarray(scores).shape[0])
except (TypeError, ValueError):
return 0
# ------------------------------------------------------------------
# Log + FDR helpers.
def _log_tile_fetch_error(self, *, tile_id, frame_id: int, exc: BaseException) -> None:
_LOG.error(
"c2_5.rerank.tile_fetch_error",
extra={
"kind": "c2_5.rerank.tile_fetch_error",
"kv": {
"frame_id": frame_id,
"tile_id": list(tile_id),
"error": repr(exc),
},
},
)
if self._fdr_client is None:
return
self._safe_enqueue(
FdrRecord(
schema_version=1,
ts=self._fdr_ts(),
producer_id=_PRODUCER_ID,
kind="rerank.tile_fetch_error",
payload={
"frame_id": int(frame_id),
"tile_id": list(tile_id),
},
)
)
def _log_backbone_error(
self,
*,
frame_id: int,
tile_id,
reason: str,
error: BaseException | None,
) -> None:
kv: dict[str, object] = {"frame_id": frame_id, "reason": reason}
if tile_id is not None:
kv["tile_id"] = list(tile_id)
if error is not None:
kv["error"] = repr(error)
_LOG.error(
"c2_5.rerank.backbone_error",
extra={"kind": "c2_5.rerank.backbone_error", "kv": kv},
)
if self._fdr_client is None:
return
payload: dict[str, object] = {
"frame_id": int(frame_id),
"reason": reason,
}
if tile_id is not None:
payload["tile_id"] = list(tile_id)
self._safe_enqueue(
FdrRecord(
schema_version=1,
ts=self._fdr_ts(),
producer_id=_PRODUCER_ID,
kind="rerank.backbone_error",
payload=payload,
)
)
def _maybe_log_zero_inliers(self, *, tile_id, frame_id: int) -> None:
if not self._debug_per_frame_log:
return
_LOG.debug(
"c2_5.rerank.zero_inliers",
extra={
"kind": "c2_5.rerank.zero_inliers",
"kv": {"frame_id": frame_id, "tile_id": list(tile_id)},
},
)
def _emit_frame_done_fdr(self, result: RerankResult) -> None:
if self._fdr_client is None:
return
top = result.candidates[0]
self._safe_enqueue(
FdrRecord(
schema_version=1,
ts=self._fdr_ts(),
producer_id=_PRODUCER_ID,
kind="rerank.frame_done",
payload={
"frame_id": int(result.frame_id),
"candidates_input": int(result.candidates_input),
"candidates_dropped": int(result.candidates_dropped),
"top_inlier_count": int(top.inlier_count),
"top_tile_id": list(top.tile_id),
},
)
)
def _emit_all_failed_fdr(
self, *, frame_id: int, candidates_input: int, candidates_dropped: int
) -> None:
if self._fdr_client is None:
return
self._safe_enqueue(
FdrRecord(
schema_version=1,
ts=self._fdr_ts(),
producer_id=_PRODUCER_ID,
kind="rerank.all_failed",
payload={
"frame_id": int(frame_id),
"candidates_input": int(candidates_input),
"candidates_dropped": int(candidates_dropped),
},
)
)
def _fail_all(
self,
*,
frame_id: int,
candidates_input: int,
candidates_dropped: int,
reason: str,
) -> None:
_LOG.error(
"c2_5.rerank.all_failed",
extra={
"kind": "c2_5.rerank.all_failed",
"kv": {
"frame_id": frame_id,
"candidates_input": candidates_input,
"candidates_dropped": candidates_dropped,
"reason": reason,
},
},
)
self._emit_all_failed_fdr(
frame_id=frame_id,
candidates_input=candidates_input,
candidates_dropped=candidates_dropped,
)
raise RerankAllCandidatesFailedError(
f"InlierCountReRanker.rerank: zero survivors "
f"(frame_id={frame_id!r}, candidates_input={candidates_input}, "
f"candidates_dropped={candidates_dropped}, reason={reason!r})"
)
def _safe_enqueue(self, record: FdrRecord) -> None:
try:
self._fdr_client.enqueue(record) # type: ignore[union-attr]
except Exception as exc:
# FDR enqueue failures are observability-only; they must
# NEVER promote to an InlierCountReRanker drop event.
_LOG.debug(
"c2_5.rerank.fdr_enqueue_failed",
extra={
"kind": "c2_5.rerank.fdr_enqueue_failed",
"kv": {"error": repr(exc)},
},
)
def _fdr_ts(self) -> str:
ns = int(self._clock.time_ns())
seconds, fraction_ns = divmod(ns, 1_000_000_000)
dt = datetime.fromtimestamp(seconds, tz=timezone.utc)
# ISO-8601 with nanosecond fractional part and an explicit UTC
# offset; survives a round-trip through datetime.fromisoformat
# (which accepts up to microseconds — the extra ns digits are
# preserved as a string suffix for the FDR consumer).
return f"{dt.strftime('%Y-%m-%dT%H:%M:%S')}.{fraction_ns:09d}+00:00"
def _ensure_bgr_array(image: object) -> np.ndarray | None:
"""Coerce ``NavCameraFrame.image`` into a BGR ``np.ndarray``.
Accepts an already-decoded array (returned as-is) or a JPEG/PNG
byte buffer (decoded via ``cv2.imdecode``). Anything else returns
``None`` so the caller routes through the backbone-error drop path.
"""
if isinstance(image, np.ndarray):
return image
if isinstance(image, (bytes, bytearray, memoryview)):
data = bytes(image)
if not data:
return None
buf = np.frombuffer(data, dtype=np.uint8)
return cv2.imdecode(buf, cv2.IMREAD_COLOR)
return None
def _decode_jpeg(jpeg_view: memoryview) -> np.ndarray:
"""Decode a JPEG ``memoryview`` into a BGR ``np.ndarray``.
Raises :class:`ValueError` if the buffer is empty or invalid; the
caller catches both and treats them as a tile-fetch-error drop.
"""
data = bytes(jpeg_view)
if not data:
raise ValueError("empty JPEG buffer")
buf = np.frombuffer(data, dtype=np.uint8)
decoded = cv2.imdecode(buf, cv2.IMREAD_COLOR)
if decoded is None:
raise ValueError("cv2.imdecode returned None for tile JPEG")
return decoded
# ----------------------------------------------------------------------
# Module-level factory entry-point consumed by
# :mod:`gps_denied_onboard.runtime_root.rerank_factory.build_rerank_strategy`.
def create(
config: Config,
*,
tile_store: object,
lightglue_runtime: LightGlueRuntime,
feature_extractor: FeatureExtractor,
clock: Clock,
fdr_client: FdrClient | None = None,
) -> object:
"""Construct an :class:`InlierCountReRanker` from injected helpers."""
strategy = InlierCountReRanker(
config=config,
tile_store=tile_store,
lightglue_runtime=lightglue_runtime,
feature_extractor=feature_extractor,
clock=clock,
fdr_client=fdr_client,
)
_LOG.info(
"c2_5.rerank.ready",
extra={
"kind": "c2_5.rerank.ready",
"kv": {
"strategy": "inlier_count",
"N": int(config.components["c2_5_rerank"].top_n),
"K": 10,
},
},
)
return strategy