[AZ-308] c6 CacheBudgetEnforcer: 10 GiB hard cap + LRU sweep

CacheBudgetEnforcer.reserve_headroom(needed_bytes) returns immediately
when total_disk_bytes() + needed_bytes <= budget, otherwise iterates
lru_candidates in eviction_batch_size batches, deletes via delete_tile,
emits one INFO log per evicted tile (c6.evicted) and one FDR record per
reserve_headroom call that evicted at least one tile (c6.eviction_batch,
evicted_tile_ids capped to 5).
Raises CacheBudgetExhaustedError AFTER a full sweep if the budget
cannot be met. BudgetEnforcedTileStore decorates a TileStore so the
policy stays separable from PostgresFilesystemStore. Composition root
in storage_factory.build_tile_store wires the wrapper unconditionally.

PostgresFilesystemStore now accepts lru_clock: Clock | None = None;
when set, read_tile_pixels calls record_lru_access(tile_id, now) so
eviction picks the right LRU candidates. Production wiring injects
WallClock(); AZ-305 unit tests still construct without the clock and
keep their pass-through semantics. Contract tile_store.md bumped to
v1.1.0 to add CacheBudgetExhaustedError to the TileCacheError family;
shared FDR schema bumped to v1.3.0 for the new c6.eviction_batch kind.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-12 20:37:41 +03:00
parent 39ff47087f
commit d571ca25f9
13 changed files with 1588 additions and 29 deletions
@@ -0,0 +1,448 @@
"""C6 cache-budget enforcer (AZ-308).
RESTRICT-SAT-2 enforcement: the on-disk tile cache MUST NOT exceed a
configurable hard cap (default 10 GiB). Every :meth:`TileStore.write_tile`
goes through the :class:`BudgetEnforcedTileStore` decorator which calls
:meth:`CacheBudgetEnforcer.reserve_headroom` BEFORE the underlying store
writes a byte to disk. If the cap would be breached, the enforcer runs
an LRU sweep using the store's :meth:`TileMetadataStore.lru_candidates`
+ :meth:`TileStore.delete_tile` primitives (AZ-305) until enough
head-room is freed; if even total eviction cannot fit ``needed_bytes``
it raises :class:`CacheBudgetExhaustedError` AFTER the loop completes
(partial eviction is preferable to no eviction — AC-5).
The enforcer is the SOLE eviction path during a flight: no other
component evicts tiles (Reliability constraint of AZ-308). Per-eviction
INFO logs (``kind="c6.evicted"``) carry the tile-level detail; the
per-batch FDR record (``kind="c6.eviction_batch"``) is bounded to the
first 5 evicted ids (AC-11) so the F4 producer never blows the FDR
ring with a runaway sweep.
The decorator pattern is mandatory — moving the budget check inside
``PostgresFilesystemStore.write_tile`` would couple policy to the
filesystem impl and break the single-responsibility design that lets
the store remain unit-testable in isolation.
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Final
from gps_denied_onboard.components.c6_tile_cache._types import (
TileId,
TileMetadata,
)
from gps_denied_onboard.components.c6_tile_cache.errors import (
CacheBudgetExhaustedError,
TileCacheError,
TileFsError,
TileMetadataError,
)
from gps_denied_onboard.components.c6_tile_cache.interface import (
TileMetadataStore,
TileStore,
)
from gps_denied_onboard.fdr_client.records import CURRENT_SCHEMA_VERSION, FdrRecord
if TYPE_CHECKING:
from gps_denied_onboard.components.c6_tile_cache._tile_pixel_handle import (
TilePixelHandle,
)
from gps_denied_onboard.fdr_client.client import FdrClient
__all__ = [
"BudgetEnforcedTileStore",
"CacheBudgetEnforcer",
"EvictionResult",
]
_PRODUCER_ID: Final[str] = "c6_tile_cache.budget"
_FDR_TILE_IDS_CAP: Final[int] = 5
@dataclass(frozen=True)
class EvictionResult:
    """Summary of what one :meth:`CacheBudgetEnforcer.reserve_headroom` call removed.

    When the no-eviction fast path fires, ``evicted`` is empty and
    ``freed_bytes`` is ``0``.
    """

    # Metadata of every tile the sweep counted as evicted, in eviction order.
    evicted: list[TileMetadata]
    # Integer total reclaimed: the sum of each evicted candidate's ``disk_bytes``.
    freed_bytes: int
def _iso_ts_now() -> str:
"""RFC 3339 UTC timestamp with microsecond precision and ``Z`` suffix.
Used only on the FDR record envelope ``ts`` field — distinct from the
per-row ``accessed_at`` / ``evicted_at`` datetimes which use the same
wall-clock source but carry the operator-facing semantics.
"""
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
class CacheBudgetEnforcer:
    """LRU-driven hard-cap enforcer for the C6 tile cache.

    Construction reads :meth:`TileMetadataStore.total_disk_bytes` once
    to emit a startup INFO log (AC-12) and warns if the cache is already
    over budget. Construction does NOT proactively evict — the first
    :meth:`reserve_headroom` call drives any required sweep.

    The enforcer keeps no running byte count of its own: every
    :meth:`reserve_headroom` call re-reads the total from the store, so
    the store stays the source of truth.
    """

    def __init__(
        self,
        *,
        store: TileMetadataStore,
        fdr_client: FdrClient,
        logger: logging.Logger,
        budget_bytes: int,
        eviction_batch_size: int = 32,
    ) -> None:
        """Validate the limits, keep the collaborators and log the startup state.

        Args:
            store: Source of ``total_disk_bytes`` / ``lru_candidates``. The
                same object is also used for ``delete_tile``, so it must
                provide that method as well.
            fdr_client: Receives one ``c6.eviction_batch`` record per sweep
                that evicted at least one tile.
            logger: Receives the structured ``c6.*`` log events.
            budget_bytes: Hard cap for the on-disk cache, in bytes.
            eviction_batch_size: ``max_count`` passed to each
                ``lru_candidates`` call during a sweep.

        Raises:
            TileMetadataError: If ``budget_bytes`` or
                ``eviction_batch_size`` is not strictly positive.
        """
        if budget_bytes <= 0:
            raise TileMetadataError(
                f"CacheBudgetEnforcer: budget_bytes must be > 0; got {budget_bytes}"
            )
        if eviction_batch_size <= 0:
            raise TileMetadataError(
                f"CacheBudgetEnforcer: eviction_batch_size must be > 0; got {eviction_batch_size}"
            )
        self._store = store
        # The runtime ``PostgresFilesystemStore`` instance satisfies both
        # the metadata-store and tile-store Protocols; the type hint stays
        # ``TileMetadataStore`` for clarity but we duck-type ``delete_tile``
        # off the same reference (AZ-308 spec § Outcome).
        self._tile_store: TileStore = store  # type: ignore[assignment]
        self._fdr_client = fdr_client
        self._logger = logger
        self._budget_bytes = budget_bytes
        self._eviction_batch_size = eviction_batch_size

        current = self._store.total_disk_bytes()
        # Clamp at zero so an over-budget cache reports "no head-room"
        # rather than a negative number.
        headroom = max(self._budget_bytes - current, 0)
        self._logger.info(
            "c6.budget.loaded",
            extra={
                "kind": "c6.budget.loaded",
                "kv": {
                    "budget_bytes": self._budget_bytes,
                    "current_disk_bytes": current,
                    "headroom_bytes": headroom,
                    "eviction_batch_size": self._eviction_batch_size,
                },
            },
        )
        if current > self._budget_bytes:
            # AC-12: prior flight left the cache over-budget; surface
            # the overage so operators can choose whether to inspect
            # before the first F4 burst triggers cascade eviction.
            self._logger.warning(
                "c6.budget.over_budget_at_construction",
                extra={
                    "kind": "c6.budget.over_budget_at_construction",
                    "kv": {
                        "budget_bytes": self._budget_bytes,
                        "current_disk_bytes": current,
                        "overage_bytes": current - self._budget_bytes,
                    },
                },
            )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    @property
    def budget_bytes(self) -> int:
        """Hard cap, in bytes, supplied at construction."""
        return self._budget_bytes

    @property
    def eviction_batch_size(self) -> int:
        """Number of LRU candidates requested from the store per batch."""
        return self._eviction_batch_size

    def reserve_headroom(
        self,
        needed_bytes: int,
        *,
        trigger_tile_id: TileId | None = None,
    ) -> EvictionResult:
        """Ensure at least ``needed_bytes`` of head-room.

        Reads :meth:`total_disk_bytes` once, computes the available
        head-room, and either:

        - returns an empty :class:`EvictionResult` if there is already
          room (AC-1, the no-evict fast path), or
        - sweeps LRU candidates in batches of ``eviction_batch_size``
          until ``freed_bytes >= shortfall`` (AC-2, AC-3, AC-4), or
        - raises :class:`CacheBudgetExhaustedError` AFTER the sweep
          exhausts all candidates without reaching the target (AC-5).

        Args:
            needed_bytes: Size of the pending write, in bytes.
            trigger_tile_id: Tile whose write caused the call; only used
                to label the ``c6.eviction_batch`` FDR record.

        Returns:
            The evicted tiles' metadata (in eviction order) and the total
            ``disk_bytes`` counted as freed.

        Raises:
            TileMetadataError: If ``needed_bytes`` is negative.
            CacheBudgetExhaustedError: If the candidates ran out before
                the shortfall was covered. Tiles evicted up to that point
                stay evicted and are still reported to the FDR.
        """
        if needed_bytes < 0:
            raise TileMetadataError(
                f"reserve_headroom: needed_bytes must be >= 0; got {needed_bytes}"
            )
        current = self._store.total_disk_bytes()
        # May be negative when the cache is already over budget; the
        # shortfall below then includes the overage.
        available = self._budget_bytes - current
        if available >= needed_bytes:
            return EvictionResult(evicted=[], freed_bytes=0)

        shortfall = needed_bytes - available
        evicted_metadata: list[TileMetadata] = []
        freed_bytes = 0
        while freed_bytes < shortfall:
            candidates = self._store.lru_candidates(max_count=self._eviction_batch_size)
            if not candidates:
                # Nothing left to evict; fall through to the exhaustion check.
                break
            for persistent in candidates:
                if freed_bytes >= shortfall:
                    break
                md = persistent.metadata
                evicted_at = datetime.now(timezone.utc)
                try:
                    removed = self._tile_store.delete_tile(md.tile_id)
                except TileFsError as exc:
                    # Row delete succeeded (AZ-305 contract) but the
                    # filesystem unlink failed; the budget already
                    # reflects the row's absence so we still count
                    # disk_bytes per the spec § Exclusions.
                    removed = True
                    self._logger.warning(
                        "c6.evict.fs_error",
                        extra={
                            "kind": "c6.evict.fs_error",
                            "kv": {
                                "tile_id_str": str(md.tile_id),
                                "disk_bytes": persistent.disk_bytes,
                                "error": str(exc),
                            },
                        },
                    )
                if not removed:
                    # NFR-reliability-delete-already-gone: a concurrent
                    # path already evicted this row. Count the bytes
                    # anyway (per spec § Exclusions) so the loop makes
                    # progress; the next ``lru_candidates`` call won't
                    # return this id.
                    self._logger.info(
                        "c6.evict.already_gone",
                        extra={
                            "kind": "c6.evict.already_gone",
                            "kv": {
                                "tile_id_str": str(md.tile_id),
                                "disk_bytes": persistent.disk_bytes,
                            },
                        },
                    )
                evicted_metadata.append(md)
                freed_bytes += persistent.disk_bytes
                self._logger.info(
                    "c6.evicted",
                    extra={
                        "kind": "c6.evicted",
                        "kv": {
                            "tile_id_str": str(md.tile_id),
                            "disk_bytes": persistent.disk_bytes,
                            "accessed_at": persistent.accessed_at.isoformat(),
                            "evicted_at": evicted_at.isoformat(),
                        },
                    },
                )

        # One FDR record per sweep, emitted before the exhaustion check so
        # a failed reservation still reports what it removed.
        if evicted_metadata:
            self._emit_eviction_batch(
                trigger_tile_id=trigger_tile_id,
                evicted=evicted_metadata,
                freed_bytes=freed_bytes,
            )
        if freed_bytes < shortfall:
            available_post = available + freed_bytes
            raise CacheBudgetExhaustedError(
                f"CacheBudgetEnforcer: cannot reserve {needed_bytes} bytes — "
                f"available_bytes={available_post} after evicting "
                f"{len(evicted_metadata)} tiles (freed {freed_bytes} bytes); "
                f"budget_bytes={self._budget_bytes}",
                needed_bytes=needed_bytes,
                available_bytes=available_post,
                evicted_count=len(evicted_metadata),
            )
        return EvictionResult(evicted=evicted_metadata, freed_bytes=freed_bytes)

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------
    def _emit_eviction_batch(
        self,
        *,
        trigger_tile_id: TileId | None,
        evicted: list[TileMetadata],
        freed_bytes: int,
    ) -> None:
        """Enqueue the single ``c6.eviction_batch`` FDR record for a sweep.

        ``evicted_count`` reports the full number of evicted tiles while
        ``evicted_tile_ids`` lists only the first ``_FDR_TILE_IDS_CAP`` of
        them, keeping the record bounded (AC-11).
        """
        capped_ids = [str(md.tile_id) for md in evicted[:_FDR_TILE_IDS_CAP]]
        # Compare against None explicitly: "no trigger" must not be
        # confused with a tile id object that happens to be falsy.
        trigger = "" if trigger_tile_id is None else str(trigger_tile_id)
        self._fdr_client.enqueue(
            FdrRecord(
                schema_version=CURRENT_SCHEMA_VERSION,
                ts=_iso_ts_now(),
                producer_id=_PRODUCER_ID,
                kind="c6.eviction_batch",
                payload={
                    "trigger_tile_id": trigger,
                    "freed_bytes": freed_bytes,
                    "evicted_count": len(evicted),
                    "evicted_tile_ids": capped_ids,
                },
            )
        )
class BudgetEnforcedTileStore:
    """:class:`TileStore` decorator that reserves head-room before each write.

    Only :meth:`write_tile` gains behaviour: it asks the
    :class:`CacheBudgetEnforcer` for head-room and then delegates. The
    remaining three methods forward straight to the wrapped store.

    No exception is caught or rewrapped here (AC-7): whatever the enforcer
    or the wrapped store raises reaches the caller unchanged.
    """

    def __init__(
        self,
        *,
        wrapped: TileStore,
        enforcer: CacheBudgetEnforcer,
    ) -> None:
        """Keep references to the decorated store and its budget enforcer."""
        self._wrapped = wrapped
        self._enforcer = enforcer

    def read_tile_pixels(self, tile_id: TileId) -> TilePixelHandle:
        """Forward to the wrapped store."""
        return self._wrapped.read_tile_pixels(tile_id)

    def write_tile(self, tile_blob: bytes, metadata: TileMetadata) -> None:
        """Reserve ``len(tile_blob)`` bytes of head-room, then delegate the write.

        The reservation (AC-6) runs to completion first; if it raises,
        the wrapped store's ``write_tile`` is never called.
        """
        incoming_bytes = len(tile_blob)
        self._enforcer.reserve_headroom(
            incoming_bytes,
            trigger_tile_id=metadata.tile_id,
        )
        self._wrapped.write_tile(tile_blob, metadata)

    def tile_exists(self, tile_id: TileId) -> bool:
        """Forward to the wrapped store."""
        return self._wrapped.tile_exists(tile_id)

    def delete_tile(self, tile_id: TileId) -> bool:
        """Forward to the wrapped store."""
        return self._wrapped.delete_tile(tile_id)
# ----------------------------------------------------------------------
# Operator CLI — `python -m gps_denied_onboard.components.c6_tile_cache.cache_budget_enforcer dry-run ...`
# ----------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="c6_tile_cache.cache_budget_enforcer",
description=(
"Operator-side dry-run of the cache-budget enforcer. Reports "
"what the LRU sweep WOULD evict to make room for a hypothetical "
"write of N bytes, without actually deleting anything."
),
)
sub = parser.add_subparsers(dest="cmd", required=True)
dry = sub.add_parser(
"dry-run",
help="Show what would be evicted for a given needed-bytes target.",
)
dry.add_argument(
"--pretend-needed-bytes",
type=int,
required=True,
help="Hypothetical incoming write size (bytes).",
)
return parser
def _dry_run(args: argparse.Namespace) -> int:
    """Print what a sweep would evict for ``args.pretend_needed_bytes``; delete nothing.

    The store and budget come from the configuration loaded from
    ``os.environ``. Only read calls are made on the store
    (``total_disk_bytes`` and ``lru_candidates``).

    Returns:
        ``0`` in every reported outcome, including the "would raise" one.
    """
    from gps_denied_onboard.components.c6_tile_cache.postgres_filesystem_store import (
        PostgresFilesystemStore,
    )
    from gps_denied_onboard.config import load_config

    config = load_config(os.environ)
    block = config.components["c6_tile_cache"]
    store = PostgresFilesystemStore.from_config(config)
    current = store.total_disk_bytes()
    budget = block.lru_eviction_threshold_bytes
    available = budget - current
    needed = args.pretend_needed_bytes
    print(f"budget_bytes: {budget}")
    print(f"current_disk_bytes: {current}")
    print(f"available_bytes: {available}")
    print(f"needed_bytes: {needed}")
    if available >= needed:
        print("decision: NO_EVICTION (already enough head-room)")
        return 0

    shortfall = needed - available
    print(f"shortfall_bytes: {shortfall}")
    batch_size = block.eviction_batch_size
    print(f"eviction_batch_size: {batch_size}")

    # The real sweep keeps asking for further batches until the shortfall
    # is covered. Nothing is deleted here, so repeating the same query
    # would return the same rows; instead widen ``max_count`` one batch at
    # a time until enough bytes are in view or the store runs out of
    # candidates.
    # NOTE(review): assumes ``lru_candidates(max_count=k)`` returns the k
    # coldest tiles in a stable order, so a wider query extends the
    # narrower one — confirm against the store implementation.
    requested = batch_size
    while True:
        candidates = list(store.lru_candidates(max_count=requested))
        in_view_bytes = sum(c.disk_bytes for c in candidates)
        if in_view_bytes >= shortfall or len(candidates) < requested:
            break
        requested += batch_size

    print(f"would_evict (in batches of up to {batch_size}):")
    freed = 0
    count = 0
    for persistent in candidates:
        if freed >= shortfall:
            break
        print(
            f"  - tile_id={persistent.metadata.tile_id} "
            f"accessed_at={persistent.accessed_at.isoformat()} "
            f"disk_bytes={persistent.disk_bytes}"
        )
        freed += persistent.disk_bytes
        count += 1
    print(f"would_free_bytes: {freed}")
    print(f"would_evict_count: {count}")
    if freed < shortfall:
        print("decision: WOULD_RAISE_CacheBudgetExhaustedError (not enough candidates)")
    else:
        print("decision: EVICT (sweep stops as soon as shortfall is covered)")
    return 0
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the requested subcommand.

    Returns:
        The exit status of the subcommand, or ``1`` when it fails with a
        :class:`TileCacheError` (the message is written to stderr).
        Invalid command lines never return: argparse exits the process.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.cmd != "dry-run":
        parser.error(f"unknown subcommand {args.cmd!r}")
        return 2  # unreachable; argparse exits non-zero on error

    try:
        return _dry_run(args)
    except TileCacheError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())
@@ -59,6 +59,7 @@ class C6TileCacheConfig:
postgres_dsn: str = ""
postgres_pool_size: int = 4
lru_eviction_threshold_bytes: int = 10 * 1024**3
eviction_batch_size: int = 32
def __post_init__(self) -> None:
if self.store_runtime not in KNOWN_TILE_STORE_RUNTIMES:
@@ -88,3 +89,7 @@ class C6TileCacheConfig:
f"C6TileCacheConfig.lru_eviction_threshold_bytes must be > 0; "
f"got {self.lru_eviction_threshold_bytes}"
)
if self.eviction_batch_size <= 0:
raise ConfigError(
f"C6TileCacheConfig.eviction_batch_size must be > 0; got {self.eviction_batch_size}"
)
@@ -22,6 +22,7 @@ if TYPE_CHECKING:
)
__all__ = [
"CacheBudgetExhaustedError",
"ContentHashMismatchError",
"FreshnessRejectionError",
"IndexBuildError",
@@ -107,6 +108,35 @@ class FreshnessRejectionError(TileCacheError):
self.rule = rule
class CacheBudgetExhaustedError(TileCacheError):
    """The cache-budget enforcer could not make room for a new write.

    Raised by :class:`CacheBudgetEnforcer.reserve_headroom` only AFTER
    the LRU sweep has run out of candidates without freeing enough space.
    Tiles evicted during that sweep stay evicted: partial eviction is
    preferred over none (AZ-308 AC-5).

    The keyword-only diagnostics are stored as attributes for the F4
    producer / operator recovery path; each defaults to ``None``:

    - ``needed_bytes``: size the failed write asked for.
    - ``available_bytes``: head-room left once the sweep finished.
    - ``evicted_count``: number of tiles the sweep removed.
    """

    def __init__(
        self,
        message: str,
        *,
        needed_bytes: int | None = None,
        available_bytes: int | None = None,
        evicted_count: int | None = None,
    ) -> None:
        """Store the message on the base exception and keep the diagnostics."""
        super().__init__(message)
        self.needed_bytes = needed_bytes
        self.available_bytes = available_bytes
        self.evicted_count = evicted_count
class IndexUnavailableError(TileCacheError):
"""The descriptor index could not satisfy a read.
@@ -83,6 +83,7 @@ from gps_denied_onboard.helpers.sha256_sidecar import (
from gps_denied_onboard.helpers.wgs_converter import WgsConverter
if TYPE_CHECKING:
from gps_denied_onboard.clock.interface import Clock
from gps_denied_onboard.config.schema import Config
__all__ = ["MmapTilePixelHandle", "PostgresFilesystemStore"]
@@ -182,6 +183,7 @@ class PostgresFilesystemStore:
fdr_client: FdrClient,
logger: logging.Logger,
freshness_gate: FreshnessGate | None = None,
lru_clock: Clock | None = None,
) -> None:
self._root_dir = Path(root_dir)
self._tiles_dir = self._root_dir / "tiles"
@@ -194,6 +196,11 @@ class PostgresFilesystemStore:
# ``None`` keeps the AZ-305-only test path working (no gate wiring
# required for unit tests of the store in isolation).
self._freshness_gate = freshness_gate
# AZ-308 AC-8: when injected, every ``read_tile_pixels`` call
# records an LRU access stamp so :meth:`lru_candidates` picks
# accurate eviction targets. ``None`` keeps AZ-305 unit-test
# wiring (no LRU UPDATE on read) intact.
self._lru_clock = lru_clock
try:
self._tiles_dir.mkdir(parents=True, exist_ok=True)
except OSError as exc:
@@ -268,6 +275,7 @@ class PostgresFilesystemStore:
fdr_client=fdr_client,
logger=logger,
freshness_gate=freshness_gate,
lru_clock=WallClock(),
)
# ------------------------------------------------------------------
@@ -322,6 +330,15 @@ class PostgresFilesystemStore:
},
)
raise TileMetadataError(msg)
# AZ-308 AC-8: refresh the LRU clock on every read so eviction
# picks the actually-coldest tiles. The UPDATE runs only when the
# Clock was injected at construction (production wiring); AZ-305
# unit tests pass ``lru_clock=None`` and pay zero cost.
if self._lru_clock is not None:
now_dt = datetime.fromtimestamp(
self._lru_clock.time_ns() / 1_000_000_000, tz=timezone.utc
)
self.record_lru_access(tile_id, now_dt)
return MmapTilePixelHandle(path)
def write_tile(self, tile_blob: bytes, metadata: TileMetadata) -> None:
@@ -145,6 +145,19 @@ KNOWN_PAYLOAD_KEYS: Final[dict[str, frozenset[str]]] = {
"c6.freshness.downgraded": frozenset(
{"tile_id", "age_seconds", "classification", "rule_action", "rule_max_age_seconds"}
),
# AZ-308 / E-C6: emitted by the CacheBudgetEnforcer at the end of every
# LRU sweep that evicted at least one tile (RESTRICT-SAT-2). One record
# per ``reserve_headroom`` call that hit the eviction path, never per
# evicted tile (per-tile detail goes to logs). ``trigger_tile_id`` is
# the canonical UUIDv5 of the tile whose write triggered the sweep;
# ``freed_bytes`` is the integer total reclaimed; ``evicted_count`` is
# the full count regardless of how many ids fit in
# ``evicted_tile_ids`` (capped to 5 to keep the record bounded —
# AC-11). The full eviction list is replayable from the per-tile
# ``c6.evicted`` INFO log records.
"c6.eviction_batch": frozenset(
{"trigger_tile_id", "freed_bytes", "evicted_count", "evicted_tile_ids"}
),
}
KNOWN_KINDS: Final[frozenset[str]] = frozenset(KNOWN_PAYLOAD_KEYS.keys())
@@ -71,14 +71,26 @@ def build_tile_store(config: Config) -> TileStore:
invoked via ``PostgresFilesystemStore.from_config(config)`` which
wires the ``ConnectionPool`` / ``FdrClient`` / logger / static
helper dependencies from the config block.
AZ-308: the returned :class:`TileStore` is wrapped in a
:class:`BudgetEnforcedTileStore` so every ``write_tile`` first
reserves head-room against the configured
``lru_eviction_threshold_bytes`` budget (RESTRICT-SAT-2). The
wrapper is transparent for read-side consumers.
"""
block = _c6_config(config)
runtime = block.store_runtime
if runtime == "postgres_filesystem":
try:
from gps_denied_onboard.components.c6_tile_cache.cache_budget_enforcer import (
BudgetEnforcedTileStore,
CacheBudgetEnforcer,
)
from gps_denied_onboard.components.c6_tile_cache.postgres_filesystem_store import (
PostgresFilesystemStore,
)
from gps_denied_onboard.fdr_client.client import make_fdr_client
from gps_denied_onboard.logging import get_logger
except ModuleNotFoundError as exc:
raise RuntimeNotAvailableError(
f"TileStore runtime {runtime!r} is configured but its "
@@ -86,7 +98,15 @@ def build_tile_store(config: Config) -> TileStore:
"'c6_tile_cache.postgres_filesystem_store' has not been "
"built into this binary yet (AZ-305 pending)."
) from exc
return PostgresFilesystemStore.from_config(config)
store = PostgresFilesystemStore.from_config(config)
enforcer = CacheBudgetEnforcer(
store=store,
fdr_client=make_fdr_client("c6_tile_cache.budget", config),
logger=get_logger("c6_tile_cache.budget"),
budget_bytes=block.lru_eviction_threshold_bytes,
eviction_batch_size=block.eviction_batch_size,
)
return BudgetEnforcedTileStore(wrapped=store, enforcer=enforcer)
raise RuntimeNotAvailableError(
f"TileStore runtime {runtime!r} is not buildable in this binary."
)