mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-21 08:11:09 +00:00
0854d3be1c
AZ-659: FramePublisher with per-consumer drop accounting (Arc<Bytes> zero-copy fan-out). Adds ConsumerId enum, PublisherStats, FrameReceiver wrapper, and publisher integration tests (AC-1, AC-2, AC-3). AZ-660: Bi-directional tonic gRPC stream to ../detections. Reconnect with bounded exponential backoff (1 s → 30 s cap). Drop-oldest in-flight budgeting (max_concurrent_in_flight = 2). ai_locked frame skipping. Integration tests against fixture in-process server (AC-1: happy path 30 fps/10 s, AC-2: reconnect, AC-3: budget drops, AC-4: ai_locked skipping). AZ-661: Schema validation (hard SchemaMismatch error on version mismatch), model_version latch with ModelVersionChanged events, sliding-window p99 latency tracker with Tier1Degraded/Tier1Recovered transitions. Integration tests (AC-1, AC-2, AC-3). Also: update module-layout.md for frame_ingest and detection_client to reflect the streaming API shape; code review report batch_18. Co-authored-by: Cursor <cursoragent@cursor.com>
291 lines
10 KiB
Rust
291 lines
10 KiB
Rust
//! `detection_client` — bi-directional gRPC client to `../detections`.
//!
//! AZ-660 wires the real `tonic` bi-directional stream + reconnect
//! state machine + drop-oldest frame budgeting. AZ-661 layers schema
//! validation, `model_version` tracking, and a sliding-window
//! latency degradation signal on top.
//!
//! ## Public surface
//!
//! - [`DetectionClient`] / [`DetectionClientConfig`] — configuration
//!   and entry-point. Build a config, hand it to
//!   [`DetectionClient::new`], then start the supervisor with
//!   [`DetectionClient::run`].
//! - [`DetectionClientHandle`] — the cheap-clone handle returned
//!   alongside the supervisor `JoinHandle`. Exposes the event stream,
//!   health surface, connection state, and shutdown.
//! - [`DetectionEvent`] — the union type emitted on the event stream
//!   (a `tokio::sync::broadcast` channel so multiple consumers may
//!   observe). Covers normal detection batches plus AZ-661 schema
//!   mismatches, model-version changes, and Tier-1 latency
//!   degradation transitions.
//!
//! The supervisor task lives in [`internal::runtime`]. It is the
//! only owner of the gRPC channel; reconnects are bounded and the
//! frame-source side never blocks on a slow gRPC server (drop-oldest
//! budgeting per AC-3 of AZ-660).
|
|
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
|
|
use tokio::sync::{broadcast, watch};
|
|
use tokio::task::JoinHandle;
|
|
|
|
use shared::health::{ComponentHealth, HealthLevel};
|
|
use shared::models::detection::DetectionBatch;
|
|
use shared::models::frame::Frame;
|
|
|
|
pub mod internal;
|
|
|
|
pub use internal::latency::DegradationTransition;
|
|
pub use internal::stats::DetectionStats;
|
|
|
|
/// Component name passed to every `ComponentHealth` constructor in
/// [`DetectionClientHandle::health`].
const NAME: &str = "detection_client";
|
|
|
|
/// Tunables for [`DetectionClient`].
///
/// [`DetectionClientConfig::new`] fills in the `description.md §3`
/// baseline: `max_concurrent_in_flight = 2`, a 100 ms p99 Tier-1
/// threshold, a 1 s → 30 s reconnect backoff, and
/// `expected_schema_version = 1`.
#[derive(Debug, Clone)]
pub struct DetectionClientConfig {
    /// Address of the detections gRPC server, e.g.
    /// `http://127.0.0.1:50051`.
    pub endpoint: String,
    /// Budget of in-flight gRPC requests. Once the budget is reached
    /// a new frame evicts the oldest in-flight slot (AC-3 of AZ-660).
    pub max_concurrent_in_flight: usize,
    /// Time limit used when connecting.
    /// NOTE(review): applied inside `internal::runtime`, which is not
    /// visible from this file — confirm it is per connection attempt.
    pub connect_timeout: Duration,
    /// Starting delay of the reconnect backoff.
    pub reconnect_initial: Duration,
    /// Upper bound of the reconnect backoff.
    pub reconnect_cap: Duration,
    /// Schema version this client was built against. A response
    /// carrying any other `schema_version` is a hard
    /// `SchemaMismatch` (AC-1 of AZ-661).
    pub expected_schema_version: u32,
    /// Capacity of the outbound mpsc channel feeding the gRPC stream.
    /// Deliberately small so frames cannot queue up indefinitely on
    /// the client side.
    pub outbound_buffer: usize,
    /// Capacity of the `events_tx` broadcast channel.
    pub event_channel_capacity: usize,
    /// Capacity of the sliding-window latency ring buffer (AZ-661).
    pub latency_window_capacity: usize,
    /// Tier-1 latency threshold (AC-3 of AZ-661). When the
    /// sliding-window p99 crosses this value a `Tier1Degraded` event
    /// is emitted; the reverse crossing emits `Tier1Recovered`.
    pub latency_p99_threshold: Duration,
}

impl DetectionClientConfig {
    /// Create a configuration targeting `endpoint`, with every other
    /// field set to its baseline default.
    pub fn new(endpoint: impl Into<String>) -> Self {
        let endpoint = endpoint.into();

        Self {
            endpoint,
            // Connection management.
            connect_timeout: Duration::from_secs(5),
            reconnect_initial: Duration::from_secs(1),
            reconnect_cap: Duration::from_secs(30),
            // Request budgeting and channel sizing.
            max_concurrent_in_flight: 2,
            outbound_buffer: 8,
            event_channel_capacity: 64,
            // Response validation and latency tracking.
            expected_schema_version: 1,
            latency_window_capacity: 1024,
            latency_p99_threshold: Duration::from_millis(100),
        }
    }
}
|
|
|
|
/// Connection state exposed through
/// [`DetectionClientHandle::connection_state`] and
/// [`DetectionClientHandle::connection_state_stream`].
///
/// [`DetectionClient::run`] seeds the underlying watch channel with
/// [`ConnectionState::Disconnected`]; the sending half is handed to
/// the supervisor task.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConnectionState {
    /// Reported as red health with detail `"disconnected"`.
    Disconnected,
    /// Reported as yellow health with detail `"connecting"`.
    Connecting,
    /// Reported as green health, or yellow while the p99 latency is
    /// above the configured threshold.
    Connected,
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub enum DetectionEvent {
|
|
/// Normal happy-path output. `capture_ts_monotonic_ns` is the
|
|
/// frame's monotonic timestamp at the moment `frame_ingest`
|
|
/// captured it (forwarded so downstream consumers can correlate
|
|
/// detections back to the original frame without re-querying
|
|
/// `frame_ingest`). `server_latency` is the server-reported
|
|
/// per-frame processing time.
|
|
Batch {
|
|
batch: DetectionBatch,
|
|
capture_ts_monotonic_ns: u64,
|
|
server_latency: Duration,
|
|
},
|
|
/// AZ-661 AC-1 — `schema_version` on a response did not match
|
|
/// `DetectionClientConfig::expected_schema_version`. The
|
|
/// response is REJECTED — no detections are forwarded for that
|
|
/// frame.
|
|
SchemaMismatch {
|
|
detail: String,
|
|
frame_seq: u64,
|
|
},
|
|
/// AZ-661 AC-2 — server reported a `model_version` different
|
|
/// from the last observed one. `previous` is `None` only on the
|
|
/// very first response in the process lifetime.
|
|
ModelVersionChanged {
|
|
previous: Option<String>,
|
|
current: String,
|
|
},
|
|
/// AZ-661 AC-3 — sliding-window p99 latency crossed the
|
|
/// configured threshold UPWARDS. The next degraded → healthy
|
|
/// crossing emits a paired [`DetectionEvent::Tier1Recovered`].
|
|
Tier1Degraded {
|
|
reason: Tier1DegradationReason,
|
|
},
|
|
Tier1Recovered,
|
|
}
|
|
|
|
/// Why a [`DetectionEvent::Tier1Degraded`] event was raised.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Tier1DegradationReason {
    /// The sliding-window p99 latency rose above the configured
    /// threshold.
    HighLatency,
}
|
|
|
|
/// Entry-point for the gRPC client. `new` is a builder; `run`
|
|
/// consumes the client and spawns the supervisor task that owns the
|
|
/// gRPC channel for the lifetime of the autopilot process.
|
|
#[derive(Debug)]
|
|
pub struct DetectionClient {
|
|
config: DetectionClientConfig,
|
|
}
|
|
|
|
impl DetectionClient {
|
|
pub fn new(config: DetectionClientConfig) -> Self {
|
|
Self { config }
|
|
}
|
|
|
|
/// Spawn the supervisor task. Returns the supervisor's
|
|
/// `JoinHandle<()>` and a cheap-clone [`DetectionClientHandle`]
|
|
/// that exposes the event stream, health surface, and
|
|
/// shutdown.
|
|
///
|
|
/// The supervisor owns `frame_rx` for its full lifetime.
|
|
/// `frame_rx` is a `tokio::sync::broadcast::Receiver<Frame>` —
|
|
/// the composition root is responsible for wiring it to
|
|
/// `frame_ingest::FrameIngestHandle::subscribe()` (raw) or to
|
|
/// a `FrameReceiver` forwarder if it wants per-consumer drop
|
|
/// attribution on the publisher side.
|
|
pub fn run(
|
|
self,
|
|
frame_rx: broadcast::Receiver<Frame>,
|
|
) -> (JoinHandle<()>, DetectionClientHandle) {
|
|
let (events_tx, _) = broadcast::channel(self.config.event_channel_capacity.max(1));
|
|
let (connection_tx, connection_rx) = watch::channel(ConnectionState::Disconnected);
|
|
let (shutdown_tx, shutdown_rx) = watch::channel(false);
|
|
let stats = DetectionStats::shared();
|
|
let latency = Arc::new(internal::latency::LatencyWindow::with_capacity(
|
|
self.config.latency_p99_threshold,
|
|
self.config.latency_window_capacity,
|
|
));
|
|
|
|
let join = internal::runtime::spawn_supervisor(
|
|
self.config.clone(),
|
|
frame_rx,
|
|
events_tx.clone(),
|
|
Arc::clone(&stats),
|
|
Arc::clone(&latency),
|
|
connection_tx,
|
|
shutdown_rx,
|
|
);
|
|
|
|
let handle = DetectionClientHandle {
|
|
stats,
|
|
latency,
|
|
connection_state_rx: connection_rx,
|
|
events_tx,
|
|
shutdown_tx,
|
|
};
|
|
|
|
(join, handle)
|
|
}
|
|
}
|
|
|
|
/// Cheap-clone handle for the `DetectionClient` supervisor. Exposes:
|
|
/// - Event subscription via [`Self::subscribe_events`].
|
|
/// - Connection-state watch via [`Self::connection_state`] /
|
|
/// [`Self::connection_state_stream`].
|
|
/// - Health surface (`description.md §3`) via [`Self::health`].
|
|
/// - Shutdown via [`Self::shutdown`] (idempotent).
|
|
#[derive(Debug, Clone)]
|
|
pub struct DetectionClientHandle {
|
|
stats: Arc<DetectionStats>,
|
|
latency: Arc<internal::latency::LatencyWindow>,
|
|
connection_state_rx: watch::Receiver<ConnectionState>,
|
|
events_tx: broadcast::Sender<DetectionEvent>,
|
|
shutdown_tx: watch::Sender<bool>,
|
|
}
|
|
|
|
impl DetectionClientHandle {
|
|
/// Subscribe to the [`DetectionEvent`] stream. The broadcast
|
|
/// channel applies its own drop-oldest back-pressure to slow
|
|
/// consumers; new subscribers see events emitted after they
|
|
/// subscribed.
|
|
pub fn subscribe_events(&self) -> broadcast::Receiver<DetectionEvent> {
|
|
self.events_tx.subscribe()
|
|
}
|
|
|
|
pub fn connection_state(&self) -> ConnectionState {
|
|
*self.connection_state_rx.borrow()
|
|
}
|
|
|
|
pub fn connection_state_stream(&self) -> watch::Receiver<ConnectionState> {
|
|
self.connection_state_rx.clone()
|
|
}
|
|
|
|
pub fn stats(&self) -> Arc<DetectionStats> {
|
|
Arc::clone(&self.stats)
|
|
}
|
|
|
|
pub fn latency_p50(&self) -> Option<Duration> {
|
|
self.latency.p50()
|
|
}
|
|
|
|
pub fn latency_p99(&self) -> Option<Duration> {
|
|
self.latency.p99()
|
|
}
|
|
|
|
pub fn shutdown(&self) {
|
|
self.shutdown_tx.send_replace(true);
|
|
}
|
|
|
|
pub fn health(&self) -> ComponentHealth {
|
|
let state = self.connection_state();
|
|
match state {
|
|
ConnectionState::Disconnected => ComponentHealth::red(NAME, "disconnected"),
|
|
ConnectionState::Connecting => ComponentHealth::yellow(NAME, "connecting"),
|
|
ConnectionState::Connected => {
|
|
// `description.md §3` — p99 above threshold is the
|
|
// operative health signal once we're connected.
|
|
let mut h = ComponentHealth::green(NAME);
|
|
if let Some(p99) = self.latency.p99() {
|
|
if p99 > self.latency.threshold() {
|
|
h.level = HealthLevel::Yellow;
|
|
h.detail = Some(format!(
|
|
"p99 {} ms > threshold {} ms",
|
|
p99.as_millis(),
|
|
self.latency.threshold().as_millis()
|
|
));
|
|
}
|
|
}
|
|
h
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the baseline defaults listed in the
    /// `DetectionClientConfig` documentation, so a change to any of
    /// them fails loudly.
    #[test]
    fn config_defaults_match_description() {
        // Arrange
        let c = DetectionClientConfig::new("http://127.0.0.1:50051");

        // Assert — the §3 baseline numbers, including both ends of
        // the 1 s → 30 s reconnect backoff.
        assert_eq!(c.max_concurrent_in_flight, 2);
        assert_eq!(c.reconnect_initial, Duration::from_secs(1));
        assert_eq!(c.reconnect_cap, Duration::from_secs(30));
        assert_eq!(c.expected_schema_version, 1);
        assert_eq!(c.latency_p99_threshold, Duration::from_millis(100));
    }
}
|