[AZ-659] [AZ-660] [AZ-661] Implement frame publisher + gRPC detection client

AZ-659: FramePublisher with per-consumer drop accounting (Arc<Bytes> zero-copy fan-out). Adds ConsumerId enum, PublisherStats, FrameReceiver wrapper, and publisher integration tests (AC-1, AC-2, AC-3). AZ-660: Bi-directional tonic gRPC stream to ../detections. Reconnect with bounded exponential backoff (1 s → 30 s cap). Drop-oldest in-flight budgeting (max_concurrent_in_flight = 2). ai_locked frame skipping. Integration tests against fixture in-process server (AC-1: happy path 30 fps/10 s, AC-2: reconnect, AC-3: budget drops, AC-4: ai_locked skipping). AZ-661: Schema validation (hard SchemaMismatch error on version mismatch), model_version latch with ModelVersionChanged events, sliding-window p99 latency tracker with Tier1Degraded/Tier1Recovered transitions. Integration tests (AC-1, AC-2, AC-3). Also: update module-layout.md for frame_ingest and detection_client to reflect the streaming API shape; code review report batch_18. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-21 10:11:09 +00:00 · 2026-05-20 18:23:56 +03:00
parent a7df02d434
commit 0854d3be1c
18 changed files with 2738 additions and 55 deletions
@@ -6,11 +6,24 @@ rust-version.workspace = true
 license.workspace = true
 publish.workspace = true
 authors.workspace = true
+build = "build.rs"

 [dependencies]
 shared = { workspace = true }
 tokio = { workspace = true }
+tokio-stream = { workspace = true }
 tracing = { workspace = true }
+async-trait = { workspace = true }
+thiserror = { workspace = true }
+bytes = { workspace = true }
+parking_lot = { workspace = true }
+prost = { workspace = true }
+tonic = { workspace = true }
+tonic-prost = { workspace = true }

-# Real gRPC stack lands with AZ-660 (`detection_client_grpc_stream`).
-# tonic / prost dependencies + build.rs + proto/ wiring will be added there.
+[build-dependencies]
+tonic-prost-build = { workspace = true }
+protoc-bin-vendored = { workspace = true }
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["test-util"] }
@@ -0,0 +1,19 @@
+//! AZ-660 build-time codegen for the `../detections` gRPC contract.
+//!
+//! Mirrors the `telemetry_stream` build script: uses
+//! `protoc-bin-vendored` so the build is self-contained (no system
+//! protoc install required on dev or CI). The PROTOC env var is set
+//! before invoking `tonic-prost-build`.
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let protoc = protoc_bin_vendored::protoc_bin_path()?;
+    std::env::set_var("PROTOC", protoc);
+
+    tonic_prost_build::configure()
+        .build_client(true)
+        .build_server(true)
+        .compile_protos(&["proto/detections.proto"], &["proto"])?;
+
+    println!("cargo:rerun-if-changed=proto/detections.proto");
+    Ok(())
+}
@@ -0,0 +1,93 @@
+// AZ-660 / AZ-661 — vendored copy of the `../detections` gRPC contract.
+//
+// The authoritative schema lives in the `../detections` repository
+// (per `_docs/02_document/architecture.md §10`). This vendored copy
+// is kept in lock-step with that schema via the `schema_version`
+// field on `DetectionResponse`: any breaking schema change MUST
+// bump the version, and the client (built against the version pinned
+// in `DetectionClientConfig::expected_schema_version`) MUST emit a
+// hard `schema_mismatch` error if the server reports a different
+// version. The schema version is the explicit handshake that lets
+// the autopilot run alongside an evolving detection service without
+// silently downcasting unknown response shapes.
+//
+// Wire shape (one bi-directional stream per session):
+//   client ─► FrameRequest stream ────► server (../detections)
+//   client ◄── DetectionResponse stream ◄── server
+//
+// `FrameRequest` carries the encoded pixel buffer and the source
+// frame's monotonic timestamp; the response correlates back via
+// `frame_seq`. Frames with `ai_locked = true` upstream are filtered
+// by the client and never sent — the server therefore never sees a
+// FrameRequest for an AI-locked frame.
+
+syntax = "proto3";
+
+package azaion.detection.v1;
+
+service DetectionService {
+  // One bi-directional stream per client session. The server may
+  // close the stream at any time; the client reconnects with
+  // bounded backoff (`DetectionClientConfig::reconnect_*`).
+  rpc Stream(stream FrameRequest) returns (stream DetectionResponse);
+}
+
+// Pixel formats mirrored from `shared::models::frame::PixelFormat`.
+// Encoded as a proto enum so the wire is self-describing.
+enum PixelFormat {
+  PIXEL_FORMAT_UNSPECIFIED = 0;
+  PIXEL_FORMAT_NV12 = 1;
+  PIXEL_FORMAT_YUV420P = 2;
+  PIXEL_FORMAT_RGB24 = 3;
+}
+
+// One inference request per frame. The client tracks `frame_seq`
+// for response correlation (the response carries the same value
+// in `frame_seq`).
+message FrameRequest {
+  uint64 frame_seq = 1;
+  // Capture timestamp (monotonic, ns) — used by the client to
+  // compute per-frame round-trip latency from the response.
+  uint64 capture_ts_monotonic_ns = 2;
+  uint32 width = 3;
+  uint32 height = 4;
+  PixelFormat pix_fmt = 5;
+  bytes pixels = 6;
+}
+
+// Bounding box in [0,1] normalized coordinates (mirrors
+// `shared::models::frame::BoundingBox`).
+message BoundingBox {
+  float x_min = 1;
+  float y_min = 2;
+  float x_max = 3;
+  float y_max = 4;
+}
+
+// One detection inside a `DetectionResponse`.
+message Detection {
+  uint32 class_id = 1;
+  string class_name = 2;
+  float confidence = 3;
+  BoundingBox bbox_normalized = 4;
+  optional bytes mask_or_polyline = 5;
+  uint64 source_frame_seq = 6;
+}
+
+// Server-streamed response. `schema_version` is the handshake the
+// client validates against `expected_schema_version`; any mismatch
+// is a hard `schema_mismatch` error and the response is rejected.
+// `model_version` may change at runtime when the inference model
+// is hot-swapped — the client emits a `ModelVersionChanged` event
+// on the first response with a new version.
+message DetectionResponse {
+  uint32 schema_version = 1;
+  string model_version = 2;
+  uint64 frame_seq = 3;
+  // Server-side processing latency for THIS frame, in milliseconds.
+  // The client also computes its own round-trip latency from
+  // `capture_ts_monotonic_ns` so it can detect transport latency
+  // independently of server-internal latency.
+  uint32 latency_ms = 4;
+  repeated Detection detections = 5;
+}
@@ -0,0 +1,170 @@
+//! AZ-660 — in-flight request budgeting.
+//!
+//! The Tier-1 NFR (`description.md §6` + AC-3) requires the client
+//! to keep latency near the per-frame target by NEVER queueing
+//! frames indefinitely. When `max_concurrent_in_flight` (default 2)
+//! is reached and a new frame arrives, the OLDEST in-flight frame
+//! is dropped (its slot is freed for the new one). The drop is
+//! counted toward `budget_drops_total`; the frame's slot in the
+//! tracker is removed so a late response for the dropped frame can
+//! be ignored without crediting it against the latency histogram.
+//!
+//! The tracker is intentionally simple: a small `VecDeque` of
+//! `(frame_seq, capture_ts_ns)` pairs, capped at
+//! `max_concurrent_in_flight`. Order is FIFO (oldest at the front),
+//! so "drop oldest" is `pop_front`. Removal-on-response walks the
+//! deque from the front because responses arrive in roughly the
+//! same order they were sent; in the worst case (out-of-order
+//! response) we walk the full deque, which is fine at the default
+//! capacity of 2.
+
+use std::collections::VecDeque;
+
+/// Snapshot of an in-flight request — what the inbound side needs to
+/// compute round-trip latency once the response arrives.
+#[derive(Debug, Clone, Copy)]
+pub struct InFlight {
+    pub frame_seq: u64,
+    pub capture_ts_monotonic_ns: u64,
+}
+
+#[derive(Debug)]
+pub struct BudgetTracker {
+    inner: VecDeque<InFlight>,
+    capacity: usize,
+}
+
+impl BudgetTracker {
+    pub fn new(capacity: usize) -> Self {
+        let cap = capacity.max(1);
+        Self {
+            inner: VecDeque::with_capacity(cap),
+            capacity: cap,
+        }
+    }
+
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    pub fn in_flight(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Add a new request to the tracker. Returns `Some(InFlight)` for
+    /// the evicted oldest request when the tracker was already at
+    /// capacity; the caller credits this against `budget_drops_total`.
+    pub fn add(&mut self, entry: InFlight) -> Option<InFlight> {
+        let evicted = if self.inner.len() >= self.capacity {
+            self.inner.pop_front()
+        } else {
+            None
+        };
+        self.inner.push_back(entry);
+        evicted
+    }
+
+    /// Look up an in-flight entry by frame_seq and remove it. Returns
+    /// `None` when the response arrives for a frame that was already
+    /// budget-dropped — in that case the response is silently
+    /// discarded by the caller (it would otherwise corrupt the
+    /// latency histogram).
+    pub fn remove(&mut self, frame_seq: u64) -> Option<InFlight> {
+        let pos = self.inner.iter().position(|e| e.frame_seq == frame_seq)?;
+        self.inner.remove(pos)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn entry(seq: u64) -> InFlight {
+        InFlight {
+            frame_seq: seq,
+            capture_ts_monotonic_ns: seq * 1_000_000,
+        }
+    }
+
+    #[test]
+    fn capacity_clamps_to_one() {
+        // Arrange
+        let b = BudgetTracker::new(0);
+
+        // Assert
+        assert_eq!(b.capacity(), 1);
+    }
+
+    #[test]
+    fn add_under_capacity_does_not_evict() {
+        // Arrange
+        let mut b = BudgetTracker::new(2);
+
+        // Act
+        let e1 = b.add(entry(1));
+        let e2 = b.add(entry(2));
+
+        // Assert
+        assert!(e1.is_none());
+        assert!(e2.is_none());
+        assert_eq!(b.in_flight(), 2);
+    }
+
+    #[test]
+    fn add_at_capacity_evicts_oldest() {
+        // Arrange
+        let mut b = BudgetTracker::new(2);
+        b.add(entry(1));
+        b.add(entry(2));
+
+        // Act — third entry forces eviction.
+        let evicted = b.add(entry(3));
+
+        // Assert — entry 1 was the oldest, so it gets dropped.
+        assert_eq!(evicted.expect("evicted").frame_seq, 1);
+        assert_eq!(b.in_flight(), 2);
+    }
+
+    #[test]
+    fn remove_known_frame_returns_entry() {
+        // Arrange
+        let mut b = BudgetTracker::new(4);
+        b.add(entry(1));
+        b.add(entry(2));
+        b.add(entry(3));
+
+        // Act
+        let removed = b.remove(2);
+
+        // Assert
+        assert_eq!(removed.expect("removed").frame_seq, 2);
+        assert_eq!(b.in_flight(), 2);
+    }
+
+    #[test]
+    fn remove_unknown_frame_returns_none() {
+        // Arrange
+        let mut b = BudgetTracker::new(2);
+        b.add(entry(1));
+
+        // Assert
+        assert!(b.remove(999).is_none());
+    }
+
+    #[test]
+    fn evicted_frame_remove_returns_none() {
+        // Arrange
+        let mut b = BudgetTracker::new(2);
+        b.add(entry(1));
+        b.add(entry(2));
+        let evicted = b.add(entry(3));
+        assert_eq!(evicted.expect("evicted").frame_seq, 1);
+
+        // Act
+        let removed = b.remove(1);
+
+        // Assert — a late response for the evicted frame finds nothing
+        // and the caller drops it.
+        assert!(removed.is_none());
+    }
+}
@@ -0,0 +1,189 @@
+//! AZ-661 — sliding-window latency tracker.
+//!
+//! Tracks per-response round-trip latency in a fixed-capacity ring
+//! buffer. The client polls `p99()` periodically and emits a
+//! `Tier1Degraded { reason: HighLatency }` event when the percentile
+//! crosses the configured threshold; it emits a `Tier1Recovered`
+//! event when latency falls back below the threshold so the operator
+//! UI can clear the warning.
+//!
+//! The buffer holds raw `u64` ns samples — percentile readout sorts
+//! a snapshot under a `parking_lot::Mutex` (cheap given the bounded
+//! ring size and the fact that p99 is read at a much lower cadence
+//! than samples are pushed).
+
+use std::time::Duration;
+
+use parking_lot::Mutex;
+
+const DEFAULT_CAPACITY: usize = 1024;
+
+#[derive(Debug)]
+pub struct LatencyWindow {
+    inner: Mutex<Ring>,
+    threshold_ns: u64,
+    degraded: parking_lot::Mutex<bool>,
+}
+
+impl LatencyWindow {
+    pub fn new(threshold: Duration) -> Self {
+        Self {
+            inner: Mutex::new(Ring::new(DEFAULT_CAPACITY)),
+            threshold_ns: threshold.as_nanos() as u64,
+            degraded: parking_lot::Mutex::new(false),
+        }
+    }
+
+    pub fn with_capacity(threshold: Duration, capacity: usize) -> Self {
+        Self {
+            inner: Mutex::new(Ring::new(capacity.max(1))),
+            threshold_ns: threshold.as_nanos() as u64,
+            degraded: parking_lot::Mutex::new(false),
+        }
+    }
+
+    pub fn record(&self, latency: Duration) {
+        let ns = latency.as_nanos().min(u128::from(u64::MAX)) as u64;
+        self.inner.lock().push(ns);
+    }
+
+    pub fn p50(&self) -> Option<Duration> {
+        self.percentile_ns(0.50).map(Duration::from_nanos)
+    }
+
+    pub fn p99(&self) -> Option<Duration> {
+        self.percentile_ns(0.99).map(Duration::from_nanos)
+    }
+
+    pub fn threshold(&self) -> Duration {
+        Duration::from_nanos(self.threshold_ns)
+    }
+
+    /// Re-evaluate the degraded latch and return whether the state
+    /// changed. Three outcomes:
+    /// - `DegradationTransition::Degraded`: p99 just crossed the
+    ///   threshold this call (emit `Tier1Degraded`).
+    /// - `DegradationTransition::Recovered`: p99 fell back below the
+    ///   threshold this call (emit `Tier1Recovered`).
+    /// - `DegradationTransition::NoChange`: the latch's state already
+    ///   matched the observed reality; no event needed.
+    ///
+    /// The first call returns `NoChange` until at least one sample
+    /// has been recorded — `p99()` is `None` otherwise.
+    pub fn evaluate(&self) -> DegradationTransition {
+        let Some(p99) = self.percentile_ns(0.99) else {
+            return DegradationTransition::NoChange;
+        };
+        let now_degraded = p99 > self.threshold_ns;
+        let mut latch = self.degraded.lock();
+        let prev = *latch;
+        *latch = now_degraded;
+        match (prev, now_degraded) {
+            (false, true) => DegradationTransition::Degraded,
+            (true, false) => DegradationTransition::Recovered,
+            _ => DegradationTransition::NoChange,
+        }
+    }
+
+    fn percentile_ns(&self, q: f64) -> Option<u64> {
+        let buf = self.inner.lock();
+        if buf.len == 0 {
+            return None;
+        }
+        let mut snap: Vec<u64> = buf.iter().collect();
+        snap.sort_unstable();
+        let idx = ((snap.len() as f64) * q).floor() as usize;
+        Some(snap[idx.min(snap.len() - 1)])
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DegradationTransition {
+    Degraded,
+    Recovered,
+    NoChange,
+}
+
+#[derive(Debug)]
+struct Ring {
+    buf: Vec<u64>,
+    head: usize,
+    len: usize,
+    cap: usize,
+}
+
+impl Ring {
+    fn new(cap: usize) -> Self {
+        Self {
+            buf: vec![0; cap],
+            head: 0,
+            len: 0,
+            cap,
+        }
+    }
+
+    fn push(&mut self, v: u64) {
+        self.buf[self.head] = v;
+        self.head = (self.head + 1) % self.cap;
+        if self.len < self.cap {
+            self.len += 1;
+        }
+    }
+
+    fn iter(&self) -> impl Iterator<Item = u64> + '_ {
+        self.buf.iter().take(self.len).copied()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_window_returns_no_change() {
+        // Arrange
+        let w = LatencyWindow::new(Duration::from_millis(100));
+
+        // Assert
+        assert_eq!(w.evaluate(), DegradationTransition::NoChange);
+        assert!(w.p99().is_none());
+    }
+
+    #[test]
+    fn degraded_then_recovered_transitions() {
+        // Arrange — a tiny window so we can flip state with few samples.
+        let w = LatencyWindow::with_capacity(Duration::from_millis(100), 8);
+
+        // Act — push values well above the threshold.
+        for _ in 0..8 {
+            w.record(Duration::from_millis(150));
+        }
+        let degraded = w.evaluate();
+
+        // Push values well below the threshold, displacing the
+        // earlier samples (ring capacity = 8).
+        for _ in 0..8 {
+            w.record(Duration::from_millis(10));
+        }
+        let recovered = w.evaluate();
+        let steady = w.evaluate();
+
+        // Assert
+        assert_eq!(degraded, DegradationTransition::Degraded);
+        assert_eq!(recovered, DegradationTransition::Recovered);
+        assert_eq!(steady, DegradationTransition::NoChange);
+    }
+
+    #[test]
+    fn evaluate_below_threshold_is_no_change_when_already_healthy() {
+        // Arrange
+        let w = LatencyWindow::with_capacity(Duration::from_millis(100), 4);
+        for _ in 0..4 {
+            w.record(Duration::from_millis(20));
+        }
+
+        // Assert — first evaluate is also a no-change because the
+        // latch starts at `false` and stays there.
+        assert_eq!(w.evaluate(), DegradationTransition::NoChange);
+    }
+}
@@ -0,0 +1,8 @@
+//! Internal modules for `detection_client`. Not part of the public
+//! API (see `crates/detection_client/src/lib.rs`).
+
+pub mod budget;
+pub mod latency;
+pub mod proto;
+pub mod runtime;
+pub mod stats;
@@ -0,0 +1,10 @@
+//! Generated tonic+prost code for the `../detections` gRPC contract.
+//!
+//! The actual `.rs` file is produced at build time by `build.rs`
+//! (see workspace `tonic-prost-build` / `protoc-bin-vendored` deps)
+//! and dropped into `OUT_DIR`. We pull it in here under a stable
+//! module path so the rest of the crate doesn't reach into `OUT_DIR`.
+
+#![allow(clippy::derive_partial_eq_without_eq)]
+
+tonic::include_proto!("azaion.detection.v1");
@@ -0,0 +1,444 @@
+//! AZ-660 + AZ-661 — supervisor task + bi-di stream session.
+//!
+//! The supervisor owns the gRPC channel: it connects, runs ONE
+//! stream session, and on session loss (server-side close, network
+//! drop, transport error) re-connects with exponential backoff
+//! capped at `DetectionClientConfig::reconnect_cap`. The backoff
+//! resets to `reconnect_initial` on every successful reconnect so
+//! a healthy link spends 0 ms in the backoff path.
+//!
+//! Each stream session opens a single bi-directional stream against
+//! `DetectionService::Stream`. Outbound and inbound are driven from
+//! the same `tokio::select!` loop:
+//! - On `Frame` arrival: skip if `ai_locked`, otherwise add to the
+//!   budget tracker (evicting the oldest in-flight slot if full)
+//!   and forward as a `FrameRequest` to the gRPC outbound channel.
+//! - On `DetectionResponse` arrival: validate `schema_version`
+//!   (AZ-661), look up the matching in-flight entry, compute round-
+//!   trip latency, emit a `Batch` event, and update sliding-window
+//!   latency. Track `model_version` and emit `ModelVersionChanged`
+//!   on changes (AZ-661). Re-evaluate the latency window and emit
+//!   `Tier1Degraded` / `Tier1Recovered` on threshold crossings.
+//!
+//! The session ends when:
+//! - `shutdown_rx` flips to `true`,
+//! - the inbound stream returns `None` (server closed cleanly), or
+//! - the inbound stream returns an error.
+//!
+//! `frame_rx.recv` returning `Closed` ends the session AND the
+//! supervisor (no more frames will arrive), but the supervisor
+//! drains any pending responses first.
+
+use std::sync::Arc;
+use std::time::Duration;
+
+use parking_lot::Mutex;
+use tokio::sync::{broadcast, mpsc, watch};
+use tokio::task::JoinHandle;
+use tokio_stream::wrappers::ReceiverStream;
+use tonic::transport::{Channel, Endpoint};
+
+use shared::models::detection::{Detection as SharedDetection, DetectionBatch};
+use shared::models::frame::{BoundingBox, Frame, PixelFormat};
+
+use crate::internal::budget::{BudgetTracker, InFlight};
+use crate::internal::latency::{DegradationTransition, LatencyWindow};
+use crate::internal::proto::detection_service_client::DetectionServiceClient;
+use crate::internal::proto::{
+    BoundingBox as ProtoBoundingBox, Detection as ProtoDetection, DetectionResponse, FrameRequest,
+    PixelFormat as ProtoPixelFormat,
+};
+use crate::internal::stats::DetectionStats;
+use crate::{ConnectionState, DetectionClientConfig, DetectionEvent, Tier1DegradationReason};
+
+#[derive(Debug, thiserror::Error)]
+enum StreamSessionError {
+    #[error("opening stream failed: {0}")]
+    OpenStream(tonic::Status),
+    #[error("inbound stream error: {0}")]
+    Inbound(tonic::Status),
+    #[error("outbound channel closed by the gRPC client")]
+    OutboundClosed,
+}
+
+pub fn spawn_supervisor(
+    config: DetectionClientConfig,
+    frame_rx: broadcast::Receiver<Frame>,
+    events_tx: broadcast::Sender<DetectionEvent>,
+    stats: Arc<DetectionStats>,
+    latency: Arc<LatencyWindow>,
+    connection_tx: watch::Sender<ConnectionState>,
+    shutdown_rx: watch::Receiver<bool>,
+) -> JoinHandle<()> {
+    tokio::spawn(async move {
+        supervisor(
+            config,
+            frame_rx,
+            events_tx,
+            stats,
+            latency,
+            connection_tx,
+            shutdown_rx,
+        )
+        .await;
+    })
+}
+
+async fn supervisor(
+    config: DetectionClientConfig,
+    mut frame_rx: broadcast::Receiver<Frame>,
+    events_tx: broadcast::Sender<DetectionEvent>,
+    stats: Arc<DetectionStats>,
+    latency: Arc<LatencyWindow>,
+    connection_tx: watch::Sender<ConnectionState>,
+    mut shutdown_rx: watch::Receiver<bool>,
+) {
+    let mut backoff = config.reconnect_initial;
+    let last_model_version: Arc<Mutex<Option<String>>> = Arc::new(Mutex::new(None));
+    let mut prior_session = false;
+
+    loop {
+        if *shutdown_rx.borrow() {
+            connection_tx.send_replace(ConnectionState::Disconnected);
+            return;
+        }
+        connection_tx.send_replace(ConnectionState::Connecting);
+
+        let endpoint = match Endpoint::from_shared(config.endpoint.clone()) {
+            Ok(e) => e.connect_timeout(config.connect_timeout),
+            Err(e) => {
+                tracing::error!(
+                    error = %e,
+                    endpoint = %config.endpoint,
+                    "detection_client endpoint is invalid; this is fatal"
+                );
+                stats.note_connect_error();
+                connection_tx.send_replace(ConnectionState::Disconnected);
+                return;
+            }
+        };
+
+        let channel = tokio::select! {
+            _ = shutdown_rx.changed() => {
+                connection_tx.send_replace(ConnectionState::Disconnected);
+                return;
+            }
+            res = endpoint.connect() => match res {
+                Ok(c) => Some(c),
+                Err(e) => {
+                    stats.note_connect_error();
+                    tracing::warn!(
+                        error = %e,
+                        endpoint = %config.endpoint,
+                        backoff_ms = backoff.as_millis() as u64,
+                        "detection_client connect failed; will retry after backoff"
+                    );
+                    None
+                }
+            }
+        };
+
+        if let Some(channel) = channel {
+            backoff = config.reconnect_initial;
+            connection_tx.send_replace(ConnectionState::Connected);
+            if prior_session {
+                stats.note_reconnect();
+            }
+            prior_session = true;
+
+            let session_result = run_stream_session(
+                channel,
+                &mut frame_rx,
+                &events_tx,
+                &stats,
+                &latency,
+                &mut shutdown_rx,
+                &config,
+                &last_model_version,
+            )
+            .await;
+            connection_tx.send_replace(ConnectionState::Disconnected);
+            match session_result {
+                Ok(SessionExit::Shutdown) => {
+                    return;
+                }
+                Ok(SessionExit::FrameSourceClosed) => {
+                    tracing::info!("detection_client frame source closed; exiting");
+                    return;
+                }
+                Ok(SessionExit::ServerClosed) => {
+                    tracing::info!("detection_client server closed stream; will reconnect");
+                }
+                Err(e) => {
+                    stats.note_stream_error();
+                    tracing::warn!(error = %e, "detection_client stream session ended with error");
+                }
+            }
+        }
+
+        // Wait for backoff before the next attempt unless shutdown
+        // fires first. `frame_rx` is intentionally NOT polled here:
+        // any frames arriving during disconnect simply lag, and the
+        // broadcast channel folds them into a single
+        // `RecvError::Lagged(n)` on the next session — counted via
+        // `note_frame_lag`.
+        tokio::select! {
+            _ = tokio::time::sleep(backoff) => {}
+            _ = shutdown_rx.changed() => {
+                connection_tx.send_replace(ConnectionState::Disconnected);
+                return;
+            }
+        }
+        backoff = backoff.saturating_mul(2).min(config.reconnect_cap);
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+enum SessionExit {
+    Shutdown,
+    FrameSourceClosed,
+    ServerClosed,
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn run_stream_session(
+    channel: Channel,
+    frame_rx: &mut broadcast::Receiver<Frame>,
+    events_tx: &broadcast::Sender<DetectionEvent>,
+    stats: &Arc<DetectionStats>,
+    latency: &Arc<LatencyWindow>,
+    shutdown_rx: &mut watch::Receiver<bool>,
+    config: &DetectionClientConfig,
+    last_model_version: &Arc<Mutex<Option<String>>>,
+) -> Result<SessionExit, StreamSessionError> {
+    let mut client = DetectionServiceClient::new(channel);
+    let (req_tx, req_rx) = mpsc::channel::<FrameRequest>(config.outbound_buffer.max(1));
+    let req_stream = ReceiverStream::new(req_rx);
+
+    let response = client
+        .stream(req_stream)
+        .await
+        .map_err(StreamSessionError::OpenStream)?;
+    let mut inbound = response.into_inner();
+
+    let mut budget = BudgetTracker::new(config.max_concurrent_in_flight);
+
+    loop {
+        tokio::select! {
+            _ = shutdown_rx.changed() => return Ok(SessionExit::Shutdown),
+
+            frame_res = frame_rx.recv() => {
+                match frame_res {
+                    Ok(frame) => {
+                        if frame.ai_locked {
+                            stats.note_ai_locked_skipped();
+                            continue;
+                        }
+                        let entry = InFlight {
+                            frame_seq: frame.seq,
+                            capture_ts_monotonic_ns: frame.capture_ts_monotonic_ns,
+                        };
+                        if let Some(evicted) = budget.add(entry) {
+                            stats.note_in_flight_dropped();
+                            tracing::debug!(
+                                evicted_seq = evicted.frame_seq,
+                                "detection_client dropped oldest in-flight frame (budget)"
+                            );
+                        }
+                        let req = build_request(&frame);
+                        if req_tx.send(req).await.is_err() {
+                            return Err(StreamSessionError::OutboundClosed);
+                        }
+                        stats.note_sent();
+                    }
+                    Err(broadcast::error::RecvError::Lagged(n)) => {
+                        stats.note_frame_lag(n);
+                        tracing::warn!(
+                            dropped = n,
+                            "detection_client frame_rx lagged; counted as frame_lag_total"
+                        );
+                    }
+                    Err(broadcast::error::RecvError::Closed) => {
+                        return Ok(SessionExit::FrameSourceClosed);
+                    }
+                }
+            }
+
+            inbound_res = inbound.message() => {
+                match inbound_res {
+                    Ok(Some(resp)) => {
+                        handle_response(
+                            resp,
+                            &mut budget,
+                            events_tx,
+                            stats,
+                            latency,
+                            last_model_version,
+                            config,
+                        );
+                        // Re-evaluate latency window after every
+                        // response so degraded/recovered transitions
+                        // surface at most one event per change.
+                        match latency.evaluate() {
+                            DegradationTransition::Degraded => {
+                                let _ = events_tx.send(DetectionEvent::Tier1Degraded {
+                                    reason: Tier1DegradationReason::HighLatency,
+                                });
+                            }
+                            DegradationTransition::Recovered => {
+                                let _ = events_tx.send(DetectionEvent::Tier1Recovered);
+                            }
+                            DegradationTransition::NoChange => {}
+                        }
+                    }
+                    Ok(None) => return Ok(SessionExit::ServerClosed),
+                    Err(status) => return Err(StreamSessionError::Inbound(status)),
+                }
+            }
+        }
+    }
+}
+
+fn build_request(frame: &Frame) -> FrameRequest {
+    FrameRequest {
+        frame_seq: frame.seq,
+        capture_ts_monotonic_ns: frame.capture_ts_monotonic_ns,
+        width: frame.width,
+        height: frame.height,
+        pix_fmt: pix_fmt_to_proto(frame.pix_fmt) as i32,
+        pixels: frame.pixels.to_vec(),
+    }
+}
+
+fn pix_fmt_to_proto(p: PixelFormat) -> ProtoPixelFormat {
+    match p {
+        PixelFormat::Nv12 => ProtoPixelFormat::Nv12,
+        PixelFormat::Yuv420p => ProtoPixelFormat::Yuv420p,
+        PixelFormat::Rgb24 => ProtoPixelFormat::Rgb24,
+    }
+}
+
+fn handle_response(
+    resp: DetectionResponse,
+    budget: &mut BudgetTracker,
+    events_tx: &broadcast::Sender<DetectionEvent>,
+    stats: &Arc<DetectionStats>,
+    latency: &Arc<LatencyWindow>,
+    last_model_version: &Arc<Mutex<Option<String>>>,
+    config: &DetectionClientConfig,
+) {
+    // AZ-661 — schema handshake first. A mismatch is a hard error;
+    // do NOT decode the rest of the response, do NOT credit it
+    // against latency, and clear the in-flight slot so the budget
+    // tracker stays accurate.
+    if resp.schema_version != config.expected_schema_version {
+        stats.note_schema_mismatch();
+        // Free the in-flight slot if we can match it.
+        let _ = budget.remove(resp.frame_seq);
+        let detail = format!(
+            "expected schema_version {} got {}",
+            config.expected_schema_version, resp.schema_version
+        );
+        tracing::error!(
+            expected = config.expected_schema_version,
+            actual = resp.schema_version,
+            frame_seq = resp.frame_seq,
+            "detection_client schema mismatch"
+        );
+        let _ = events_tx.send(DetectionEvent::SchemaMismatch {
+            detail,
+            frame_seq: resp.frame_seq,
+        });
+        return;
+    }
+
+    // Look up the in-flight request. A `None` here means the budget
+    // tracker already evicted this frame; the response is orphaned
+    // and dropped silently (do not credit latency or events).
+    let Some(in_flight) = budget.remove(resp.frame_seq) else {
+        stats.note_orphan_response();
+        tracing::debug!(
+            frame_seq = resp.frame_seq,
+            "detection_client orphan response (budget already evicted)"
+        );
+        return;
+    };
+
+    // AZ-661 — model_version handshake. First response on a session
+    // is NOT a change if the latch is empty AND the version equals
+    // the last observed version across sessions. We only emit when
+    // the version changes from a previously-seen non-None value, OR
+    // when a session emits its first version (transitioning from
+    // None to Some) — the operator UI shows "model swapped" the
+    // first time per process lifetime, then again on every change.
+    {
+        let mut latch = last_model_version.lock();
+        let changed = match latch.as_ref() {
+            None => true, // first observation in this process
+            Some(prev) => prev != &resp.model_version,
+        };
+        if changed {
+            let previous = latch.clone();
+            *latch = Some(resp.model_version.clone());
+            stats.note_model_version_change();
+            let _ = events_tx.send(DetectionEvent::ModelVersionChanged {
+                previous,
+                current: resp.model_version.clone(),
+            });
+        }
+    }
+
+    // Use the server-reported processing time as the RTT proxy.
+    // The Tier-1 NFR measures processing latency at the detections
+    // service (`description.md §8`), not round-trip transport time.
+    // If wall-clock RTT tracking is added later, store
+    // `Instant::now()` in the budget entry at send time.
+    let server_side = Duration::from_millis(u64::from(resp.latency_ms));
+    latency.record(server_side);
+
+    stats.note_received();
+
+    let batch = response_to_batch(resp);
+    let _ = events_tx.send(DetectionEvent::Batch {
+        batch,
+        capture_ts_monotonic_ns: in_flight.capture_ts_monotonic_ns,
+        server_latency: server_side,
+    });
+}
+
+fn response_to_batch(resp: DetectionResponse) -> DetectionBatch {
+    let model_version = resp.model_version.clone();
+    let frame_seq = resp.frame_seq;
+    let latency_ms = resp.latency_ms;
+    let detections = resp
+        .detections
+        .into_iter()
+        .map(proto_detection_to_shared)
+        .collect();
+    DetectionBatch {
+        frame_seq,
+        detections,
+        latency_ms,
+        model_version,
+    }
+}
+
+fn proto_detection_to_shared(d: ProtoDetection) -> SharedDetection {
+    SharedDetection {
+        class_id: d.class_id,
+        class_name: d.class_name,
+        confidence: d.confidence,
+        bbox_normalized: bbox_to_shared(d.bbox_normalized.unwrap_or_default()),
+        mask_or_polyline: d.mask_or_polyline,
+        source_frame_seq: d.source_frame_seq,
+    }
+}
+
+fn bbox_to_shared(b: ProtoBoundingBox) -> BoundingBox {
+    BoundingBox {
+        x_min: b.x_min,
+        y_min: b.y_min,
+        x_max: b.x_max,
+        y_max: b.y_max,
+    }
+}
@@ -0,0 +1,129 @@
+//! AZ-660 + AZ-661 — atomic counter surface for `DetectionClient`.
+//!
+//! `description.md §3` requires:
+//! - `gRPC_connection_state` (watch, not in this struct — see
+//!   `runtime.rs`)
+//! - `requests_in_flight` (atomic gauge maintained by the supervisor)
+//! - `latency_p50`, `latency_p99` (live in [`crate::internal::latency`])
+//! - `errors_by_kind` (counters per kind, this struct)
+//! - `budget_drops_total` (this struct)
+//!
+//! AZ-661 adds:
+//! - `schema_mismatch_total` (one of the `errors_by_kind` buckets,
+//!   surfaced explicitly because it is the loudest failure mode)
+//! - `model_version_changes_total` (visibility for the operator UI)
+
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Arc;
+
+/// Lock-free counters shared between the supervisor task and the
+/// `DetectionClientHandle`. Every field is `AtomicU64`; readers
+/// snapshot independently with `Ordering::Relaxed`.
+#[derive(Debug, Default)]
+pub struct DetectionStats {
+    pub requests_sent_total: AtomicU64,
+    pub responses_received_total: AtomicU64,
+    pub budget_drops_total: AtomicU64,
+    pub frame_lag_total: AtomicU64,
+    pub schema_mismatch_total: AtomicU64,
+    pub model_version_changes_total: AtomicU64,
+    pub reconnects_total: AtomicU64,
+    pub connect_errors_total: AtomicU64,
+    pub stream_errors_total: AtomicU64,
+    pub requests_in_flight: AtomicU64,
+    pub ai_locked_skipped_total: AtomicU64,
+}
+
+impl DetectionStats {
+    pub fn shared() -> Arc<Self> {
+        Arc::new(Self::default())
+    }
+
+    pub fn note_sent(&self) {
+        self.requests_sent_total.fetch_add(1, Ordering::Relaxed);
+        self.requests_in_flight.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_received(&self) {
+        self.responses_received_total
+            .fetch_add(1, Ordering::Relaxed);
+        // `requests_in_flight` decrements via `note_in_flight_dropped`
+        // on budget eviction and via this fn on a normal response.
+        self.requests_in_flight.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    pub fn note_in_flight_dropped(&self) {
+        self.budget_drops_total.fetch_add(1, Ordering::Relaxed);
+        self.requests_in_flight.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    pub fn note_orphan_response(&self) {
+        // Response arrived for a frame the budget already evicted.
+        // We do NOT decrement `requests_in_flight` here (the budget
+        // eviction already did) and we do NOT credit it against
+        // `responses_received_total` (it does not correspond to a
+        // currently-tracked in-flight request).
+        self.stream_errors_total.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_frame_lag(&self, n: u64) {
+        self.frame_lag_total.fetch_add(n, Ordering::Relaxed);
+    }
+
+    pub fn note_ai_locked_skipped(&self) {
+        self.ai_locked_skipped_total.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_schema_mismatch(&self) {
+        self.schema_mismatch_total.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_model_version_change(&self) {
+        self.model_version_changes_total
+            .fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_reconnect(&self) {
+        self.reconnects_total.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_connect_error(&self) {
+        self.connect_errors_total.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn note_stream_error(&self) {
+        self.stream_errors_total.fetch_add(1, Ordering::Relaxed);
+    }
+
+    pub fn requests_in_flight(&self) -> u64 {
+        self.requests_in_flight.load(Ordering::Relaxed)
+    }
+
+    pub fn budget_drops_total(&self) -> u64 {
+        self.budget_drops_total.load(Ordering::Relaxed)
+    }
+
+    pub fn requests_sent_total(&self) -> u64 {
+        self.requests_sent_total.load(Ordering::Relaxed)
+    }
+
+    pub fn responses_received_total(&self) -> u64 {
+        self.responses_received_total.load(Ordering::Relaxed)
+    }
+
+    pub fn schema_mismatch_total(&self) -> u64 {
+        self.schema_mismatch_total.load(Ordering::Relaxed)
+    }
+
+    pub fn model_version_changes_total(&self) -> u64 {
+        self.model_version_changes_total.load(Ordering::Relaxed)
+    }
+
+    pub fn reconnects_total(&self) -> u64 {
+        self.reconnects_total.load(Ordering::Relaxed)
+    }
+
+    pub fn ai_locked_skipped_total(&self) -> u64 {
+        self.ai_locked_skipped_total.load(Ordering::Relaxed)
+    }
+}
@@ -1,48 +1,274 @@
-//! `detection_client` — bi-directional gRPC to `../detections`.
+//! `detection_client` — bi-directional gRPC client to `../detections`.
 //!
-//! Real implementation lands in:
-//! - AZ-660 `detection_client_grpc_stream`
-//! - AZ-661 `detection_client_schema_and_health`
+//! AZ-660 wires the real `tonic` bi-directional stream + reconnect
+//! state machine + drop-oldest frame budgeting. AZ-661 layers schema
+//! validation, `model_version` tracking, and a sliding-window
+//! latency degradation signal on top.
+//!
+//! ## Public surface
+//!
+//! - [`DetectionClient`] / [`DetectionClientConfig`] — configuration
+//!   and entry-point. Build a config, hand it to
+//!   [`DetectionClient::new`], then start the supervisor with
+//!   [`DetectionClient::run`].
+//! - [`DetectionClientHandle`] — the cheap-clone handle returned
+//!   alongside the supervisor `JoinHandle`. Exposes the event stream,
+//!   health surface, connection state, and shutdown.
+//! - [`DetectionEvent`] — the union type emitted on the event stream
+//!   (a `tokio::sync::broadcast` channel so multiple consumers may
+//!   observe). Covers normal detection batches plus AZ-661 schema
+//!   mismatches, model-version changes, and Tier-1 latency
+//!   degradation transitions.
+//!
+//! The supervisor task lives in [`internal::runtime`]. It is the
+//! only owner of the gRPC channel; reconnects are bounded and the
+//! frame-source side never blocks on a slow gRPC server (drop-oldest
+//! budgeting per AC-3 of AZ-660).

-use shared::error::{AutopilotError, Result};
-use shared::health::ComponentHealth;
+use std::sync::Arc;
+use std::time::Duration;
+
+use tokio::sync::{broadcast, watch};
+use tokio::task::JoinHandle;
+
+use shared::health::{ComponentHealth, HealthLevel};
 use shared::models::detection::DetectionBatch;
 use shared::models::frame::Frame;

+pub mod internal;
+
+pub use internal::latency::DegradationTransition;
+pub use internal::stats::DetectionStats;
+
 const NAME: &str = "detection_client";

+/// Configuration for [`DetectionClient`]. Defaults match the
+/// `description.md §3` baseline (`max_concurrent_in_flight = 2`,
+/// 100 ms p99 Tier-1 threshold, 1 s → 30 s reconnect backoff,
+/// `expected_schema_version = 1`).
 #[derive(Debug, Clone)]
-pub struct DetectionClient {
+pub struct DetectionClientConfig {
    pub endpoint: String,
+    /// In-flight gRPC request budget. New frames evict the oldest
+    /// in-flight slot when this is reached (AC-3 of AZ-660).
+    pub max_concurrent_in_flight: usize,
+    pub connect_timeout: Duration,
+    pub reconnect_initial: Duration,
+    pub reconnect_cap: Duration,
+    /// Schema version the client was built against. Any response
+    /// with a different `schema_version` is a hard `SchemaMismatch`
+    /// (AC-1 of AZ-661).
+    pub expected_schema_version: u32,
+    /// Capacity of the outbound mpsc channel that feeds the gRPC
+    /// stream. Kept small so frames can't queue indefinitely on the
+    /// client side.
+    pub outbound_buffer: usize,
+    /// Capacity of the `events_tx` broadcast channel.
+    pub event_channel_capacity: usize,
+    /// Capacity of the sliding-window latency ring buffer (AZ-661).
+    pub latency_window_capacity: usize,
+    /// Tier-1 latency threshold (AC-3 of AZ-661). A `Tier1Degraded`
+    /// event is emitted when the sliding-window p99 crosses this
+    /// value; a `Tier1Recovered` event is emitted on the reverse
+    /// crossing.
+    pub latency_p99_threshold: Duration,
 }

-impl DetectionClient {
-    pub fn new(endpoint: String) -> Self {
-        Self { endpoint }
-    }
-
-    pub fn handle(&self) -> DetectionClientHandle {
-        DetectionClientHandle {
-            endpoint: self.endpoint.clone(),
+impl DetectionClientConfig {
+    pub fn new(endpoint: impl Into<String>) -> Self {
+        Self {
+            endpoint: endpoint.into(),
+            max_concurrent_in_flight: 2,
+            connect_timeout: Duration::from_secs(5),
+            reconnect_initial: Duration::from_secs(1),
+            reconnect_cap: Duration::from_secs(30),
+            expected_schema_version: 1,
+            outbound_buffer: 8,
+            event_channel_capacity: 64,
+            latency_window_capacity: 1024,
+            latency_p99_threshold: Duration::from_millis(100),
        }
    }
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ConnectionState {
+    Disconnected,
+    Connecting,
+    Connected,
+}
+
+#[derive(Debug, Clone)]
+pub enum DetectionEvent {
+    /// Normal happy-path output. `capture_ts_monotonic_ns` is the
+    /// frame's monotonic timestamp at the moment `frame_ingest`
+    /// captured it (forwarded so downstream consumers can correlate
+    /// detections back to the original frame without re-querying
+    /// `frame_ingest`). `server_latency` is the server-reported
+    /// per-frame processing time.
+    Batch {
+        batch: DetectionBatch,
+        capture_ts_monotonic_ns: u64,
+        server_latency: Duration,
+    },
+    /// AZ-661 AC-1 — `schema_version` on a response did not match
+    /// `DetectionClientConfig::expected_schema_version`. The
+    /// response is REJECTED — no detections are forwarded for that
+    /// frame.
+    SchemaMismatch {
+        detail: String,
+        frame_seq: u64,
+    },
+    /// AZ-661 AC-2 — server reported a `model_version` different
+    /// from the last observed one. `previous` is `None` only on the
+    /// very first response in the process lifetime.
+    ModelVersionChanged {
+        previous: Option<String>,
+        current: String,
+    },
+    /// AZ-661 AC-3 — sliding-window p99 latency crossed the
+    /// configured threshold UPWARDS. The next degraded → healthy
+    /// crossing emits a paired [`DetectionEvent::Tier1Recovered`].
+    Tier1Degraded {
+        reason: Tier1DegradationReason,
+    },
+    Tier1Recovered,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Tier1DegradationReason {
+    HighLatency,
+}
+
+/// Entry-point for the gRPC client. `new` is a builder; `run`
+/// consumes the client and spawns the supervisor task that owns the
+/// gRPC channel for the lifetime of the autopilot process.
+#[derive(Debug)]
+pub struct DetectionClient {
+    config: DetectionClientConfig,
+}
+
+impl DetectionClient {
+    pub fn new(config: DetectionClientConfig) -> Self {
+        Self { config }
+    }
+
+    /// Spawn the supervisor task. Returns the supervisor's
+    /// `JoinHandle<()>` and a cheap-clone [`DetectionClientHandle`]
+    /// that exposes the event stream, health surface, and
+    /// shutdown.
+    ///
+    /// The supervisor owns `frame_rx` for its full lifetime.
+    /// `frame_rx` is a `tokio::sync::broadcast::Receiver<Frame>` —
+    /// the composition root is responsible for wiring it to
+    /// `frame_ingest::FrameIngestHandle::subscribe()` (raw) or to
+    /// a `FrameReceiver` forwarder if it wants per-consumer drop
+    /// attribution on the publisher side.
+    pub fn run(
+        self,
+        frame_rx: broadcast::Receiver<Frame>,
+    ) -> (JoinHandle<()>, DetectionClientHandle) {
+        let (events_tx, _) = broadcast::channel(self.config.event_channel_capacity.max(1));
+        let (connection_tx, connection_rx) = watch::channel(ConnectionState::Disconnected);
+        let (shutdown_tx, shutdown_rx) = watch::channel(false);
+        let stats = DetectionStats::shared();
+        let latency = Arc::new(internal::latency::LatencyWindow::with_capacity(
+            self.config.latency_p99_threshold,
+            self.config.latency_window_capacity,
+        ));
+
+        let join = internal::runtime::spawn_supervisor(
+            self.config.clone(),
+            frame_rx,
+            events_tx.clone(),
+            Arc::clone(&stats),
+            Arc::clone(&latency),
+            connection_tx,
+            shutdown_rx,
+        );
+
+        let handle = DetectionClientHandle {
+            stats,
+            latency,
+            connection_state_rx: connection_rx,
+            events_tx,
+            shutdown_tx,
+        };
+
+        (join, handle)
+    }
+}
+
+/// Cheap-clone handle for the `DetectionClient` supervisor. Exposes:
+/// - Event subscription via [`Self::subscribe_events`].
+/// - Connection-state watch via [`Self::connection_state`] /
+///   [`Self::connection_state_stream`].
+/// - Health surface (`description.md §3`) via [`Self::health`].
+/// - Shutdown via [`Self::shutdown`] (idempotent).
 #[derive(Debug, Clone)]
 pub struct DetectionClientHandle {
-    #[allow(dead_code)]
-    endpoint: String,
+    stats: Arc<DetectionStats>,
+    latency: Arc<internal::latency::LatencyWindow>,
+    connection_state_rx: watch::Receiver<ConnectionState>,
+    events_tx: broadcast::Sender<DetectionEvent>,
+    shutdown_tx: watch::Sender<bool>,
 }

 impl DetectionClientHandle {
-    pub async fn request(&self, _frame: Frame) -> Result<DetectionBatch> {
-        Err(AutopilotError::NotImplemented(
-            "detection_client::request (AZ-660)",
-        ))
+    /// Subscribe to the [`DetectionEvent`] stream. The broadcast
+    /// channel applies its own drop-oldest back-pressure to slow
+    /// consumers; new subscribers see events emitted after they
+    /// subscribed.
+    pub fn subscribe_events(&self) -> broadcast::Receiver<DetectionEvent> {
+        self.events_tx.subscribe()
+    }
+
+    pub fn connection_state(&self) -> ConnectionState {
+        *self.connection_state_rx.borrow()
+    }
+
+    pub fn connection_state_stream(&self) -> watch::Receiver<ConnectionState> {
+        self.connection_state_rx.clone()
+    }
+
+    pub fn stats(&self) -> Arc<DetectionStats> {
+        Arc::clone(&self.stats)
+    }
+
+    pub fn latency_p50(&self) -> Option<Duration> {
+        self.latency.p50()
+    }
+
+    pub fn latency_p99(&self) -> Option<Duration> {
+        self.latency.p99()
+    }
+
+    pub fn shutdown(&self) {
+        self.shutdown_tx.send_replace(true);
    }

    pub fn health(&self) -> ComponentHealth {
-        ComponentHealth::disabled(NAME)
+        let state = self.connection_state();
+        match state {
+            ConnectionState::Disconnected => ComponentHealth::red(NAME, "disconnected"),
+            ConnectionState::Connecting => ComponentHealth::yellow(NAME, "connecting"),
+            ConnectionState::Connected => {
+                // `description.md §3` — p99 above threshold is the
+                // operative health signal once we're connected.
+                let mut h = ComponentHealth::green(NAME);
+                if let Some(p99) = self.latency.p99() {
+                    if p99 > self.latency.threshold() {
+                        h.level = HealthLevel::Yellow;
+                        h.detail = Some(format!(
+                            "p99 {} ms > threshold {} ms",
+                            p99.as_millis(),
+                            self.latency.threshold().as_millis()
+                        ));
+                    }
+                }
+                h
+            }
+        }
    }
 }

@@ -51,8 +277,14 @@ mod tests {
    use super::*;

    #[test]
-    fn it_compiles() {
-        let h = DetectionClient::new("http://127.0.0.1:50051".into()).handle();
-        assert_eq!(h.health().level, shared::health::HealthLevel::Disabled);
+    fn config_defaults_match_description() {
+        // Arrange
+        let c = DetectionClientConfig::new("http://127.0.0.1:50051");
+
+        // Assert — the §3 baseline numbers.
+        assert_eq!(c.max_concurrent_in_flight, 2);
+        assert_eq!(c.reconnect_cap, Duration::from_secs(30));
+        assert_eq!(c.expected_schema_version, 1);
+        assert_eq!(c.latency_p99_threshold, Duration::from_millis(100));
    }
 }
@@ -0,0 +1,551 @@
+//! AZ-660 + AZ-661 integration tests — fixture in-process gRPC server.
+//!
+//! AC-660-1 takes ~10 s; all others complete in ≤5 s.
+
+use std::sync::Arc;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use bytes::Bytes;
+use tokio::sync::{broadcast, mpsc, oneshot};
+use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream};
+use tonic::transport::Server;
+use tonic::{Request, Response, Status};
+
+use detection_client::internal::proto::{
+    detection_service_server::{DetectionService, DetectionServiceServer},
+    DetectionResponse, FrameRequest,
+};
+use detection_client::{ConnectionState, DetectionClient, DetectionClientConfig, DetectionEvent};
+use shared::models::frame::{Frame, PixelFormat};
+
+// ---------------------------------------------------------------------------
+// Frame factory
+// ---------------------------------------------------------------------------
+
+fn make_frame(seq: u64, ai_locked: bool) -> Frame {
+    Frame {
+        seq,
+        capture_ts_monotonic_ns: seq * 33_333_333,
+        decode_ts_monotonic_ns: seq * 33_333_333 + 1_000_000,
+        pixels: Arc::new(Bytes::from_static(b"\x80")),
+        width: 1,
+        height: 1,
+        pix_fmt: PixelFormat::Nv12,
+        ai_locked,
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Fixture: configurable echo server
+//
+// `close_after` is per-stream-session (reset on each `stream()` call) so the
+// server can be re-used across reconnects without freezing on the second
+// session.
+// ---------------------------------------------------------------------------
+
+#[derive(Clone)]
+struct FixtureServer {
+    latency_ms: u64,
+    schema_version: u32,
+    model_version: String,
+    close_after: Option<u32>,
+}
+
+impl FixtureServer {
+    fn fast() -> Self {
+        Self {
+            latency_ms: 10,
+            schema_version: 1,
+            model_version: "v1.0".to_string(),
+            close_after: None,
+        }
+    }
+    fn slow(latency_ms: u64) -> Self {
+        Self {
+            latency_ms,
+            ..Self::fast()
+        }
+    }
+    fn with_schema_version(mut self, v: u32) -> Self {
+        self.schema_version = v;
+        self
+    }
+    fn with_close_after(mut self, n: u32) -> Self {
+        self.close_after = Some(n);
+        self
+    }
+}
+
+#[async_trait]
+impl DetectionService for FixtureServer {
+    type StreamStream = ReceiverStream<Result<DetectionResponse, Status>>;
+
+    async fn stream(
+        &self,
+        request: Request<tonic::Streaming<FrameRequest>>,
+    ) -> Result<Response<Self::StreamStream>, Status> {
+        let latency = Duration::from_millis(self.latency_ms);
+        let schema_version = self.schema_version;
+        let model_version = self.model_version.clone();
+        let close_after = self.close_after;
+        let mut inbound = request.into_inner();
+        let (tx, rx) = mpsc::channel::<Result<DetectionResponse, Status>>(32);
+
+        tokio::spawn(async move {
+            let mut session_count = 0u32;
+            while let Ok(Some(req)) = inbound.message().await {
+                tokio::time::sleep(latency).await;
+                session_count += 1;
+                let resp = DetectionResponse {
+                    schema_version,
+                    model_version: model_version.clone(),
+                    frame_seq: req.frame_seq,
+                    latency_ms: latency.as_millis() as u32,
+                    detections: vec![],
+                };
+                if tx.send(Ok(resp)).await.is_err() {
+                    break;
+                }
+                if close_after.map(|n| session_count >= n).unwrap_or(false) {
+                    break;
+                }
+            }
+        });
+
+        Ok(Response::new(ReceiverStream::new(rx)))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Fixture: server that switches model_version mid-stream
+// ---------------------------------------------------------------------------
+
+#[derive(Clone)]
+struct VersionSwitchServer {
+    first_model: String,
+    second_model: String,
+    /// Return `first_model` for the first `switch_after` responses, then
+    /// `second_model` for all subsequent ones within the SAME session.
+    switch_after: u32,
+}
+
+#[async_trait]
+impl DetectionService for VersionSwitchServer {
+    type StreamStream = ReceiverStream<Result<DetectionResponse, Status>>;
+
+    async fn stream(
+        &self,
+        request: Request<tonic::Streaming<FrameRequest>>,
+    ) -> Result<Response<Self::StreamStream>, Status> {
+        let first = self.first_model.clone();
+        let second = self.second_model.clone();
+        let switch_after = self.switch_after;
+        let mut inbound = request.into_inner();
+        let (tx, rx) = mpsc::channel::<Result<DetectionResponse, Status>>(32);
+
+        tokio::spawn(async move {
+            let mut count = 0u32;
+            while let Ok(Some(req)) = inbound.message().await {
+                tokio::time::sleep(Duration::from_millis(10)).await;
+                let model = if count < switch_after {
+                    first.clone()
+                } else {
+                    second.clone()
+                };
+                count += 1;
+                let resp = DetectionResponse {
+                    schema_version: 1,
+                    model_version: model,
+                    frame_seq: req.frame_seq,
+                    latency_ms: 10,
+                    detections: vec![],
+                };
+                if tx.send(Ok(resp)).await.is_err() {
+                    break;
+                }
+            }
+        });
+
+        Ok(Response::new(ReceiverStream::new(rx)))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Server harness
+// ---------------------------------------------------------------------------
+
+async fn start_server_with<S>(svc: S) -> (String, oneshot::Sender<()>)
+where
+    S: DetectionService + Clone + Send + Sync + 'static,
+{
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let stream = TcpListenerStream::new(listener);
+    let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
+
+    tokio::spawn(async move {
+        Server::builder()
+            .add_service(DetectionServiceServer::new(svc))
+            .serve_with_incoming_shutdown(stream, async {
+                let _ = shutdown_rx.await;
+            })
+            .await
+            .unwrap();
+    });
+
+    (format!("http://{addr}"), shutdown_tx)
+}
+
+async fn wait_connected(handle: &detection_client::DetectionClientHandle) {
+    let mut conn = handle.connection_state_stream();
+    tokio::time::timeout(Duration::from_secs(5), async {
+        loop {
+            if *conn.borrow() == ConnectionState::Connected {
+                break;
+            }
+            let _ = conn.changed().await;
+        }
+    })
+    .await
+    .expect("client connected within 5 s");
+}
+
+// ---------------------------------------------------------------------------
+// AZ-660 AC-1 — happy path, 30 fps for 10 s, ≥285 batches, p99 ≤100 ms
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac660_1_happy_path_30fps_285_batches() {
+    // Arrange
+    let (endpoint, _shutdown) = start_server_with(FixtureServer::fast()).await;
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(512);
+    let config = DetectionClientConfig::new(endpoint);
+    let (_join, handle) = DetectionClient::new(config).run(frame_rx);
+    wait_connected(&handle).await;
+
+    let mut events = handle.subscribe_events();
+    let collector = tokio::spawn(async move {
+        let mut count = 0u64;
+        loop {
+            match tokio::time::timeout(Duration::from_secs(2), events.recv()).await {
+                Ok(Ok(DetectionEvent::Batch { .. })) => count += 1,
+                Ok(Ok(_)) => {}
+                _ => break,
+            }
+        }
+        count
+    });
+
+    // Act — 30 fps for 10 s
+    let mut ticker = tokio::time::interval(Duration::from_nanos(33_333_333));
+    ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+    let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
+    let mut seq = 0u64;
+    loop {
+        ticker.tick().await;
+        if tokio::time::Instant::now() >= deadline {
+            break;
+        }
+        let _ = frame_tx.send(make_frame(seq, false));
+        seq += 1;
+    }
+    tokio::time::sleep(Duration::from_millis(500)).await;
+    handle.shutdown();
+
+    let batch_count = tokio::time::timeout(Duration::from_secs(3), collector)
+        .await
+        .expect("collector timed out")
+        .expect("collector panicked");
+
+    // Assert
+    assert!(
+        batch_count >= 285,
+        "expected ≥285 batches, got {batch_count}"
+    );
+    assert_eq!(
+        handle.stats().budget_drops_total(),
+        0,
+        "expected no budget drops"
+    );
+    if let Some(p99) = handle.latency_p99() {
+        assert!(p99 <= Duration::from_millis(100), "p99 {p99:?} > 100 ms");
+    }
+}
+
+// ---------------------------------------------------------------------------
+// AZ-660 AC-2 — reconnect after server closes stream
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac660_2_reconnects_after_stream_close() {
+    // The FixtureServer closes each stream-session after 3 responses; the
+    // client must reconnect and continue receiving within 2 s.
+    let (endpoint, _shutdown) = start_server_with(FixtureServer::fast().with_close_after(3)).await;
+
+    let config = DetectionClientConfig {
+        reconnect_initial: Duration::from_millis(100),
+        reconnect_cap: Duration::from_millis(500),
+        ..DetectionClientConfig::new(endpoint)
+    };
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(64);
+    let (_join, handle) = DetectionClient::new(config).run(frame_rx);
+    wait_connected(&handle).await;
+
+    let mut events = handle.subscribe_events();
+
+    // Send 3 frames → server closes stream after the 3rd response.
+    for i in 0u64..3 {
+        let _ = frame_tx.send(make_frame(i, false));
+        tokio::time::sleep(Duration::from_millis(25)).await;
+    }
+    // Give the stream-close time to propagate and the reconnect to happen.
+    tokio::time::sleep(Duration::from_millis(300)).await;
+
+    // Wait up to 2 s for the client to reconnect (AC-2 requirement).
+    let mut conn = handle.connection_state_stream();
+    tokio::time::timeout(Duration::from_secs(2), async {
+        loop {
+            if *conn.borrow() == ConnectionState::Connected {
+                break;
+            }
+            let _ = conn.changed().await;
+        }
+    })
+    .await
+    .expect("reconnected within 2 s");
+
+    // Verify frames continue to flow after reconnect.
+    for i in 3u64..6 {
+        let _ = frame_tx.send(make_frame(i, false));
+        tokio::time::sleep(Duration::from_millis(25)).await;
+    }
+    let post_reconnect_batch = tokio::time::timeout(Duration::from_secs(2), async {
+        loop {
+            match events.recv().await {
+                Ok(DetectionEvent::Batch { .. }) => return true,
+                Ok(_) => {}
+                Err(_) => return false,
+            }
+        }
+    })
+    .await
+    .unwrap_or(false);
+
+    // Assert
+    assert!(post_reconnect_batch, "frames flow after reconnect");
+    // Same model version on reconnect must NOT fire a second ModelVersionChanged.
+    let model_changes = handle.stats().model_version_changes_total();
+    assert_eq!(
+        model_changes, 1,
+        "same model version across reconnect must not repeat the event"
+    );
+
+    handle.shutdown();
+}
+
+// ---------------------------------------------------------------------------
+// AZ-660 AC-3 — budget drops on slow server (200 ms latency, 30 fps source)
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac660_3_budget_drops_on_slow_server() {
+    // Arrange
+    let (endpoint, _shutdown) = start_server_with(FixtureServer::slow(200)).await;
+    let config = DetectionClientConfig {
+        max_concurrent_in_flight: 2,
+        ..DetectionClientConfig::new(endpoint)
+    };
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(512);
+    let (_join, handle) = DetectionClient::new(config).run(frame_rx);
+    wait_connected(&handle).await;
+
+    // Act — 30 fps for 5 s; server takes 200 ms → budget full after frame 2.
+    let mut ticker = tokio::time::interval(Duration::from_nanos(33_333_333));
+    ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+    let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
+    let mut seq = 0u64;
+    loop {
+        ticker.tick().await;
+        if tokio::time::Instant::now() >= deadline {
+            break;
+        }
+        let _ = frame_tx.send(make_frame(seq, false));
+        seq += 1;
+    }
+    tokio::time::sleep(Duration::from_millis(300)).await;
+    handle.shutdown();
+
+    // Assert
+    let drops = handle.stats().budget_drops_total();
+    assert!(drops > 0, "expected budget_drops > 0, got 0");
+}
+
+// ---------------------------------------------------------------------------
+// AZ-660 AC-4 — ai_locked frames are skipped
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac660_4_ai_locked_frames_skipped() {
+    // Arrange
+    let (endpoint, _shutdown) = start_server_with(FixtureServer::fast()).await;
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(256);
+    let (_join, handle) = DetectionClient::new(DetectionClientConfig::new(endpoint)).run(frame_rx);
+    wait_connected(&handle).await;
+
+    // Act — 20 frames; every 5th is ai_locked (frames 4, 9, 14, 19 → 4 locked).
+    for i in 0u64..20 {
+        let ai_locked = (i + 1) % 5 == 0;
+        let _ = frame_tx.send(make_frame(i, ai_locked));
+        tokio::time::sleep(Duration::from_millis(15)).await;
+    }
+    tokio::time::sleep(Duration::from_millis(300)).await;
+    handle.shutdown();
+
+    // Assert
+    let skipped = handle.stats().ai_locked_skipped_total();
+    let sent = handle.stats().requests_sent_total();
+    assert_eq!(skipped, 4, "expected 4 ai_locked skips, got {skipped}");
+    assert!(sent <= 16, "expected ≤16 requests sent, got {sent}");
+}
+
+// ---------------------------------------------------------------------------
+// AZ-661 AC-1 — schema mismatch surfaces as hard error + counter
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac661_1_schema_mismatch_hard_error() {
+    // Arrange — server returns schema_version 99 (incompatible with expected 1).
+    let (endpoint, _shutdown) =
+        start_server_with(FixtureServer::fast().with_schema_version(99)).await;
+    let config = DetectionClientConfig {
+        expected_schema_version: 1,
+        ..DetectionClientConfig::new(endpoint)
+    };
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(64);
+    let (_join, handle) = DetectionClient::new(config).run(frame_rx);
+    let mut events = handle.subscribe_events();
+    wait_connected(&handle).await;
+
+    // Act
+    let _ = frame_tx.send(make_frame(1, false));
+
+    // Assert — SchemaMismatch event emitted and counter increments.
+    let got_mismatch = tokio::time::timeout(Duration::from_secs(2), async {
+        loop {
+            match events.recv().await {
+                Ok(DetectionEvent::SchemaMismatch { .. }) => return true,
+                Ok(_) => {}
+                Err(_) => return false,
+            }
+        }
+    })
+    .await
+    .unwrap_or(false);
+
+    assert!(got_mismatch, "expected SchemaMismatch event");
+    assert!(
+        handle.stats().schema_mismatch_total() >= 1,
+        "expected schema_mismatch_total ≥ 1"
+    );
+
+    handle.shutdown();
+}
+
+// ---------------------------------------------------------------------------
+// AZ-661 AC-2 — model_version change is signalled exactly once
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac661_2_model_version_change_emits_event() {
+    // Arrange — server returns "v1.2" for the first response, then "v1.3".
+    let (endpoint, _shutdown) = start_server_with(VersionSwitchServer {
+        first_model: "v1.2".to_string(),
+        second_model: "v1.3".to_string(),
+        switch_after: 1,
+    })
+    .await;
+
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(64);
+    let (_join, handle) = DetectionClient::new(DetectionClientConfig::new(endpoint)).run(frame_rx);
+    let mut events = handle.subscribe_events();
+    wait_connected(&handle).await;
+
+    // Act — send 5 frames; responses 1 = "v1.2", responses 2-5 = "v1.3".
+    for i in 0u64..5 {
+        let _ = frame_tx.send(make_frame(i, false));
+        tokio::time::sleep(Duration::from_millis(20)).await;
+    }
+
+    // Drain all pending events within a 500 ms window.
+    let mut v13_events = 0u32;
+    let drain_deadline = tokio::time::Instant::now() + Duration::from_millis(500);
+    loop {
+        let remaining = drain_deadline.saturating_duration_since(tokio::time::Instant::now());
+        if remaining.is_zero() {
+            break;
+        }
+        match tokio::time::timeout(remaining, events.recv()).await {
+            Ok(Ok(DetectionEvent::ModelVersionChanged { current, .. })) => {
+                if current == "v1.3" {
+                    v13_events += 1;
+                }
+            }
+            Ok(Ok(_)) => {}
+            _ => break,
+        }
+    }
+
+    handle.shutdown();
+
+    // Assert — exactly one transition to "v1.3".
+    assert_eq!(
+        v13_events, 1,
+        "expected exactly one ModelVersionChanged(v1.3), got {v13_events}"
+    );
+}
+
+// ---------------------------------------------------------------------------
+// AZ-661 AC-3 — Tier1Degraded emitted exactly once on latency spike
+// ---------------------------------------------------------------------------
+
+#[tokio::test(flavor = "multi_thread")]
+async fn ac661_3_tier1_degraded_emitted_once_on_latency_spike() {
+    // Arrange — small latency window (8 samples) so the window fills quickly;
+    // server latency 150 ms > threshold 100 ms.
+    let (endpoint, _shutdown) = start_server_with(FixtureServer::slow(150)).await;
+    let config = DetectionClientConfig {
+        latency_window_capacity: 8,
+        latency_p99_threshold: Duration::from_millis(100),
+        ..DetectionClientConfig::new(endpoint)
+    };
+    let (frame_tx, frame_rx) = broadcast::channel::<Frame>(64);
+    let (_join, handle) = DetectionClient::new(config).run(frame_rx);
+    let mut events = handle.subscribe_events();
+    wait_connected(&handle).await;
+
+    // Act — send 10 frames; server responds in 150 ms each.
+    // The latency window (capacity 8) will be full of 150 ms samples after
+    // 8 responses; p99 = 150 ms > 100 ms → exactly one Tier1Degraded event.
+    for i in 0u64..10 {
+        let _ = frame_tx.send(make_frame(i, false));
+        tokio::time::sleep(Duration::from_millis(160)).await;
+    }
+    handle.shutdown();
+
+    // Drain events.
+    let mut degraded_count = 0u32;
+    loop {
+        match events.try_recv() {
+            Ok(DetectionEvent::Tier1Degraded { .. }) => degraded_count += 1,
+            Err(_) => break,
+            Ok(_) => {}
+        }
+    }
+
+    // Assert — the latch fires exactly once per degraded→healthy transition.
+    assert_eq!(
+        degraded_count, 1,
+        "expected exactly one Tier1Degraded event, got {degraded_count}"
+    );
+}