mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-21 09:41:11 +00:00
[AZ-659] [AZ-660] [AZ-661] Implement frame publisher + gRPC detection client
AZ-659: FramePublisher with per-consumer drop accounting (Arc<Bytes> zero-copy fan-out). Adds ConsumerId enum, PublisherStats, FrameReceiver wrapper, and publisher integration tests (AC-1, AC-2, AC-3). AZ-660: Bi-directional tonic gRPC stream to ../detections. Reconnect with bounded exponential backoff (1 s → 30 s cap). Drop-oldest in-flight budgeting (max_concurrent_in_flight = 2). ai_locked frame skipping. Integration tests against fixture in-process server (AC-1: happy path 30 fps/10 s, AC-2: reconnect, AC-3: budget drops, AC-4: ai_locked skipping). AZ-661: Schema validation (hard SchemaMismatch error on version mismatch), model_version latch with ModelVersionChanged events, sliding-window p99 latency tracker with Tier1Degraded/Tier1Recovered transitions. Integration tests (AC-1, AC-2, AC-3). Also: update module-layout.md for frame_ingest and detection_client to reflect the streaming API shape; code review report batch_18. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -2,5 +2,6 @@
|
||||
|
||||
pub mod decoder;
|
||||
pub mod lifecycle;
|
||||
pub mod publisher;
|
||||
pub mod rtsp_client;
|
||||
pub mod timestamp;
|
||||
|
||||
@@ -0,0 +1,366 @@
|
||||
//! AZ-659 — multi-consumer frame publisher with per-consumer drop accounting.
|
||||
//!
|
||||
//! `FrameIngest` already fans out to multiple subscribers via
|
||||
//! `tokio::sync::broadcast`, but a raw broadcast receiver silently
|
||||
//! folds lag into a single `RecvError::Lagged(n)` return value. The
|
||||
//! lifecycle loop has no way to attribute those drops back to *which*
|
||||
//! consumer fell behind, and the operator UI cannot tell "the AI
|
||||
//! tier is slow" from "the modem is slow".
|
||||
//!
|
||||
//! This module wraps the broadcast hub with:
|
||||
//!
|
||||
//! - a `ConsumerId` enum that names the three known consumers per
|
||||
//! `description.md §3` (`detection_client`, `movement_detector`,
|
||||
//! `telemetry_stream`);
|
||||
//! - a `PublisherStats` struct holding one `AtomicU64` drop counter
|
||||
//! per consumer plus a total publish counter (lock-free; never
|
||||
//! blocks the lifecycle loop);
|
||||
//! - a `FrameReceiver` wrapper around `broadcast::Receiver<Frame>`
|
||||
//! that intercepts `RecvError::Lagged(n)` and folds it into the
|
||||
//! right per-consumer counter before silently retrying — drops
|
||||
//! are *counted*, never silent (`description.md §6` AC-2);
|
||||
//! - a `FramePublisher` struct that owns the broadcast `Sender` plus
|
||||
//! the stats handle, exposes `subscribe(ConsumerId)`, and is
|
||||
//! constructed with a configurable channel depth.
|
||||
//!
|
||||
//! The zero-copy property required by AC-3 lives in the `Frame`
|
||||
//! struct itself (`pixels: Arc<Bytes>`); the publisher does not
|
||||
//! copy the payload — the broadcast channel hands every subscriber
|
||||
//! the same `Arc`, so memory does not scale with consumer count.
|
||||
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::sync::broadcast;
|
||||
|
||||
use shared::models::frame::Frame;
|
||||
|
||||
/// Channel depth used when the caller has no specific requirement
/// (`description.md §3`): the nominal number of frames that may queue
/// for a consumer before that consumer's oldest frame is dropped.
///
/// Four frames let a 30 fps pipeline ride out a downstream stall of
/// roughly 130 ms without losing anything; a longer stall drops
/// frames until the consumer catches up.
pub const DEFAULT_CHANNEL_DEPTH: usize = 4;
|
||||
|
||||
/// Identity of a downstream frame consumer.
///
/// Three consumers are known today. The enum is `non_exhaustive` so
/// that later additions (e.g. on-board recording) can be introduced
/// without breaking matchers in other crates.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum ConsumerId {
    /// The `detection_client` consumer.
    DetectionClient,
    /// The `movement_detector` consumer.
    MovementDetector,
    /// The `telemetry_stream` consumer.
    Telemetry,
}

impl ConsumerId {
    /// Returns the canonical drop-reason tag for this consumer.
    ///
    /// The tag is written to logs and surfaced through
    /// `FrameIngestHandle::dropped_frames`. It follows the
    /// `description.md §6` reason vocabulary, so the operator UI's
    /// existing reason filter keeps working unchanged.
    pub fn drop_reason(self) -> &'static str {
        match self {
            ConsumerId::Telemetry => "telemetry_slow",
            ConsumerId::MovementDetector => "movement_detector_slow",
            ConsumerId::DetectionClient => "detection_client_slow",
        }
    }

    /// Returns a short identifier suitable for use as a `tracing`
    /// field value.
    pub fn as_str(self) -> &'static str {
        match self {
            ConsumerId::Telemetry => "telemetry_stream",
            ConsumerId::MovementDetector => "movement_detector",
            ConsumerId::DetectionClient => "detection_client",
        }
    }
}
|
||||
|
||||
/// Lock-free counters consumed by `FrameIngestHandle::health` and by
|
||||
/// the operator-side per-consumer drop dashboard. Held inside an
|
||||
/// `Arc` and shared by the lifecycle task (writer side, via
|
||||
/// `FramePublisher::publish`) and every active `FrameReceiver`
|
||||
/// (writer side, via lag interception).
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PublisherStats {
|
||||
publishes_total: AtomicU64,
|
||||
detection_client_drops: AtomicU64,
|
||||
movement_detector_drops: AtomicU64,
|
||||
telemetry_drops: AtomicU64,
|
||||
}
|
||||
|
||||
impl PublisherStats {
|
||||
pub fn shared() -> Arc<Self> {
|
||||
Arc::new(Self::default())
|
||||
}
|
||||
|
||||
pub fn publishes_total(&self) -> u64 {
|
||||
self.publishes_total.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn drops_for(&self, consumer: ConsumerId) -> u64 {
|
||||
self.counter(consumer).load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn note_publish(&self) {
|
||||
self.publishes_total.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn note_drop(&self, consumer: ConsumerId, n: u64) {
|
||||
self.counter(consumer).fetch_add(n, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn counter(&self, consumer: ConsumerId) -> &AtomicU64 {
|
||||
match consumer {
|
||||
ConsumerId::DetectionClient => &self.detection_client_drops,
|
||||
ConsumerId::MovementDetector => &self.movement_detector_drops,
|
||||
ConsumerId::Telemetry => &self.telemetry_drops,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Multi-consumer fan-out hub. Wraps a `tokio::sync::broadcast`
|
||||
/// sender with the per-consumer accounting needed by AC-2 of
|
||||
/// AZ-659. The channel capacity is the `channel_depth` configured
|
||||
/// at construction; the broadcast channel's natural overwrite
|
||||
/// behaviour gives the "drop oldest for the slow consumer" semantic
|
||||
/// the task spec requires.
|
||||
#[derive(Debug)]
|
||||
pub struct FramePublisher {
|
||||
tx: broadcast::Sender<Frame>,
|
||||
stats: Arc<PublisherStats>,
|
||||
channel_depth: usize,
|
||||
}
|
||||
|
||||
impl FramePublisher {
|
||||
pub fn new(channel_depth: usize) -> Self {
|
||||
let depth = channel_depth.max(1);
|
||||
let (tx, _rx) = broadcast::channel(depth);
|
||||
Self {
|
||||
tx,
|
||||
stats: PublisherStats::shared(),
|
||||
channel_depth: depth,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn channel_depth(&self) -> usize {
|
||||
self.channel_depth
|
||||
}
|
||||
|
||||
/// Snapshot accessor for the shared stats object. Cheap clone
|
||||
/// (one `Arc::clone`).
|
||||
pub fn stats(&self) -> Arc<PublisherStats> {
|
||||
Arc::clone(&self.stats)
|
||||
}
|
||||
|
||||
/// Subscribe under a named consumer identity. Per-consumer lag
|
||||
/// gets attributed to the named consumer's drop counter.
|
||||
pub fn subscribe(&self, consumer: ConsumerId) -> FrameReceiver {
|
||||
FrameReceiver {
|
||||
rx: self.tx.subscribe(),
|
||||
consumer,
|
||||
stats: Arc::clone(&self.stats),
|
||||
}
|
||||
}
|
||||
|
||||
/// Subscribe without per-consumer accounting. Use for code paths
|
||||
/// that don't fit into one of the three known consumer roles
|
||||
/// (e.g. test harnesses, ad-hoc inspection). Lag on these
|
||||
/// receivers is *not* counted toward any per-consumer total.
|
||||
pub fn subscribe_raw(&self) -> broadcast::Receiver<Frame> {
|
||||
self.tx.subscribe()
|
||||
}
|
||||
|
||||
/// Publish a frame. Returns the number of receivers that were
|
||||
/// subscribed at the moment the send happened (informational).
|
||||
/// Increments `publishes_total` even when there are zero
|
||||
/// subscribers — the publish *attempt* is what we measure for
|
||||
/// the §6 publish-rate dashboard.
|
||||
pub fn publish(&self, frame: Frame) -> usize {
|
||||
self.stats.note_publish();
|
||||
// `broadcast::Sender::send` returns `Err(SendError(_))` when
|
||||
// there are zero active receivers. That's a normal state
|
||||
// during start-up (consumers spawn slightly after the
|
||||
// publisher) and is not a failure — we treat the return
|
||||
// value purely as "how many consumers got this frame".
|
||||
self.tx.send(frame).unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Subscriber count snapshot — useful for health-server output
|
||||
/// ("AI tier was not subscribed when first frame arrived").
|
||||
pub fn receiver_count(&self) -> usize {
|
||||
self.tx.receiver_count()
|
||||
}
|
||||
}
|
||||
|
||||
/// `broadcast::Receiver<Frame>` wrapper that folds lag into the
|
||||
/// owning consumer's drop counter before transparently retrying.
|
||||
/// `recv()` only returns `Ok(Frame)` or a fatal `RecvError::Closed`
|
||||
/// — lag is never surfaced to the caller; it is recorded and the
|
||||
/// next available frame is returned.
|
||||
#[derive(Debug)]
|
||||
pub struct FrameReceiver {
|
||||
rx: broadcast::Receiver<Frame>,
|
||||
consumer: ConsumerId,
|
||||
stats: Arc<PublisherStats>,
|
||||
}
|
||||
|
||||
impl FrameReceiver {
|
||||
pub fn consumer(&self) -> ConsumerId {
|
||||
self.consumer
|
||||
}
|
||||
|
||||
/// Block until the next frame is available. On lag, record the
|
||||
/// drop count against this consumer and immediately retry; the
|
||||
/// caller never sees `Lagged`. The only error variant returned
|
||||
/// is `RecvError::Closed`, which means the publisher has been
|
||||
/// dropped.
|
||||
pub async fn recv(&mut self) -> Result<Frame, RecvError> {
|
||||
loop {
|
||||
match self.rx.recv().await {
|
||||
Ok(frame) => return Ok(frame),
|
||||
Err(broadcast::error::RecvError::Lagged(n)) => {
|
||||
self.note_lag(n);
|
||||
}
|
||||
Err(broadcast::error::RecvError::Closed) => return Err(RecvError::Closed),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Non-blocking variant. `Empty` is the channel-is-currently-empty
|
||||
/// case (no frames produced since the last `recv`/`try_recv`),
|
||||
/// not a fatal state. `Closed` mirrors the async variant.
|
||||
pub fn try_recv(&mut self) -> Result<Frame, TryRecvError> {
|
||||
loop {
|
||||
match self.rx.try_recv() {
|
||||
Ok(frame) => return Ok(frame),
|
||||
Err(broadcast::error::TryRecvError::Empty) => return Err(TryRecvError::Empty),
|
||||
Err(broadcast::error::TryRecvError::Closed) => return Err(TryRecvError::Closed),
|
||||
Err(broadcast::error::TryRecvError::Lagged(n)) => {
|
||||
self.note_lag(n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn note_lag(&self, n: u64) {
|
||||
self.stats.note_drop(self.consumer, n);
|
||||
tracing::warn!(
|
||||
consumer = self.consumer.as_str(),
|
||||
reason = self.consumer.drop_reason(),
|
||||
dropped = n,
|
||||
"frame_publisher dropped frames for slow consumer"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors that `FrameReceiver::recv` can return. Lag is *not* in
|
||||
/// this list — it is accounted internally.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RecvError {
|
||||
#[error("frame publisher closed")]
|
||||
Closed,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum TryRecvError {
|
||||
#[error("no frame available")]
|
||||
Empty,
|
||||
#[error("frame publisher closed")]
|
||||
Closed,
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use bytes::Bytes;
    use shared::models::frame::{Frame, PixelFormat};

    use super::*;

    /// Builds a 320x240 NV12 test frame whose timestamps are derived
    /// from `seq` and whose pixel buffer is the given shared payload.
    fn make_frame(seq: u64, payload: Arc<Bytes>) -> Frame {
        Frame {
            seq,
            capture_ts_monotonic_ns: seq * 1_000_000,
            decode_ts_monotonic_ns: seq * 1_000_000 + 100,
            pixels: payload,
            width: 320,
            height: 240,
            pix_fmt: PixelFormat::Nv12,
            ai_locked: false,
        }
    }

    /// A requested depth of zero must be clamped, because
    /// `broadcast::channel(0)` would panic.
    #[test]
    fn channel_depth_defaults_to_at_least_one() {
        // Arrange
        let publisher = FramePublisher::new(0);

        // Assert
        assert!(publisher.channel_depth() >= 1);
    }

    /// Drop-reason tags must match the `description.md` vocabulary.
    #[test]
    fn drop_reason_matches_description_md_vocabulary() {
        let expected = [
            (ConsumerId::DetectionClient, "detection_client_slow"),
            (ConsumerId::MovementDetector, "movement_detector_slow"),
            (ConsumerId::Telemetry, "telemetry_slow"),
        ];
        for (consumer, reason) in expected {
            assert_eq!(consumer.drop_reason(), reason);
        }
    }

    /// Publishing with nobody subscribed still counts as a publish
    /// attempt and never registers a drop.
    #[tokio::test]
    async fn publish_increments_total_even_without_subscribers() {
        // Arrange
        let publisher = FramePublisher::new(DEFAULT_CHANNEL_DEPTH);
        let stats = publisher.stats();
        let payload = Arc::new(Bytes::from_static(&[0u8; 32]));

        // Act
        (0..5).for_each(|seq| {
            publisher.publish(make_frame(seq, Arc::clone(&payload)));
        });

        // Assert
        assert_eq!(stats.publishes_total(), 5);
        for consumer in [
            ConsumerId::DetectionClient,
            ConsumerId::MovementDetector,
            ConsumerId::Telemetry,
        ] {
            assert_eq!(stats.drops_for(consumer), 0);
        }
    }

    /// All three subscribers must receive the very same `Arc<Bytes>`
    /// allocation that was published.
    #[tokio::test]
    async fn three_subscribers_share_arc_pixels_zero_copy() {
        // Arrange
        let publisher = FramePublisher::new(DEFAULT_CHANNEL_DEPTH);
        let mut det_rx = publisher.subscribe(ConsumerId::DetectionClient);
        let mut mov_rx = publisher.subscribe(ConsumerId::MovementDetector);
        let mut tel_rx = publisher.subscribe(ConsumerId::Telemetry);
        let payload = Arc::new(Bytes::from(vec![0xABu8; 1024]));

        // Act
        publisher.publish(make_frame(1, Arc::clone(&payload)));
        let from_det = det_rx.recv().await.expect("det recv");
        let from_mov = mov_rx.recv().await.expect("mov recv");
        let from_tel = tel_rx.recv().await.expect("tel recv");

        // Assert — pointer equality, not just equal contents.
        let det_matches_mov = Arc::ptr_eq(&from_det.pixels, &from_mov.pixels);
        assert!(
            det_matches_mov,
            "det/mov must share the same Arc — broadcast must not deep-clone Bytes"
        );
        let mov_matches_tel = Arc::ptr_eq(&from_mov.pixels, &from_tel.pixels);
        assert!(mov_matches_tel, "mov/tel must share the same Arc");
        let det_matches_source = Arc::ptr_eq(&from_det.pixels, &payload);
        assert!(
            det_matches_source,
            "received Arc must be the original payload pointer"
        );
    }
}
|
||||
+110
-24
@@ -1,4 +1,4 @@
|
||||
//! `frame_ingest` — RTSP pull + decode + timestamp.
|
||||
//! `frame_ingest` — RTSP pull + decode + timestamp + publish.
|
||||
//!
|
||||
//! Real implementation lands in:
|
||||
//! - AZ-657 `frame_ingest_rtsp_session` — session lifecycle + bounded
|
||||
@@ -7,18 +7,31 @@
|
||||
//! fallback) + per-frame monotonic timestamping + decode stats
|
||||
//! (this crate, `internal/decoder.rs` + `internal/timestamp.rs`).
|
||||
//! - AZ-659 `frame_ingest_publisher` — bounded broadcast + per-consumer
|
||||
//! drop policy.
|
||||
//! drop policy (this crate, `internal/publisher.rs`).
|
||||
//!
|
||||
//! ## AZ-658 surface (extends AZ-657)
|
||||
//!
|
||||
//! `FrameIngest::run` now takes a [`FrameDecoder`]. The lifecycle loop
|
||||
//! `FrameIngest::run` takes a [`FrameDecoder`]. The lifecycle loop
|
||||
//! stamps the capture timestamp the moment a packet leaves the
|
||||
//! transport, hands the encoded payload to the decoder, and emits one
|
||||
//! [`Frame`] per decoded picture with `decode_ts_monotonic_ns` set
|
||||
//! when the decoder returned. Single-frame decode errors increment
|
||||
//! `decode_errors_total` and drop the frame; the stream is never
|
||||
//! aborted (AC-3). The decoder backend (`Nvdec` / `Software`) is
|
||||
//! observable via [`FrameIngestHandle::decoder_backend`].
|
||||
//! aborted. The decoder backend (`Nvdec` / `Software`) is observable
|
||||
//! via [`FrameIngestHandle::decoder_backend`].
|
||||
//!
|
||||
//! ## AZ-659 surface (extends AZ-658)
|
||||
//!
|
||||
//! Decoded frames flow through a [`FramePublisher`]. The publisher
|
||||
//! exposes [`FrameIngestHandle::subscribe_as`] for the three known
|
||||
//! consumers (`detection_client`, `movement_detector`,
|
||||
//! `telemetry_stream`); each subscriber's lag is folded into a
|
||||
//! per-consumer drop counter visible via
|
||||
//! [`FrameIngestHandle::dropped_frames`]. Drops are *counted* and
|
||||
//! `tracing::warn`-logged with a reason tag — never silent.
|
||||
//! `FrameIngestHandle::subscribe()` is preserved for legacy callers
|
||||
//! that don't fit one of the three named consumer roles; lag on
|
||||
//! those raw receivers is not attributed to any consumer counter.
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
@@ -38,6 +51,10 @@ pub use internal::decoder::{
|
||||
FfmpegDecoder, FrameDecoder,
|
||||
};
|
||||
pub use internal::lifecycle::{BackoffPolicy, LifecycleStats, SessionState};
|
||||
pub use internal::publisher::{
|
||||
ConsumerId, FramePublisher, FrameReceiver, PublisherStats, RecvError as FrameRecvError,
|
||||
TryRecvError as FrameTryRecvError, DEFAULT_CHANNEL_DEPTH,
|
||||
};
|
||||
pub use internal::rtsp_client::{
|
||||
OpenError, RtspPacket, RtspSessionConfig, RtspTransport, RtspTransportHint, StreamError,
|
||||
};
|
||||
@@ -53,7 +70,7 @@ const NAME: &str = "frame_ingest";
|
||||
const RED_FRAME_AGE: Duration = Duration::from_secs(5);
|
||||
|
||||
pub struct FrameIngest {
|
||||
tx: broadcast::Sender<Frame>,
|
||||
publisher: Arc<FramePublisher>,
|
||||
ai_lock_tx: watch::Sender<bool>,
|
||||
state_tx: watch::Sender<SessionState>,
|
||||
shutdown_tx: watch::Sender<bool>,
|
||||
@@ -65,6 +82,10 @@ pub struct FrameIngest {
|
||||
}
|
||||
|
||||
impl FrameIngest {
|
||||
/// Default constructor — `channel_capacity` maps directly to the
|
||||
/// publisher's `channel_depth` (see `description.md §3`). Use
|
||||
/// [`Self::with_backoff`] when both the depth and the reopen
|
||||
/// backoff need to be customised.
|
||||
pub fn new(channel_capacity: usize) -> Self {
|
||||
Self::with_backoff(
|
||||
channel_capacity,
|
||||
@@ -73,13 +94,13 @@ impl FrameIngest {
|
||||
}
|
||||
|
||||
pub fn with_backoff(channel_capacity: usize, backoff: BackoffPolicy) -> Self {
|
||||
let (tx, _rx) = broadcast::channel(channel_capacity);
|
||||
let publisher = Arc::new(FramePublisher::new(channel_capacity));
|
||||
let (ai_lock_tx, _) = watch::channel(false);
|
||||
let (state_tx, _) = watch::channel(SessionState::Closed);
|
||||
let (shutdown_tx, _) = watch::channel(false);
|
||||
let (backend_tx, _) = watch::channel(None);
|
||||
Self {
|
||||
tx,
|
||||
publisher,
|
||||
ai_lock_tx,
|
||||
state_tx,
|
||||
shutdown_tx,
|
||||
@@ -91,9 +112,18 @@ impl FrameIngest {
|
||||
}
|
||||
}
|
||||
|
||||
/// Shared accessor for the underlying [`FramePublisher`]. The
|
||||
/// composition root passes this `Arc` to consumers that prefer
|
||||
/// to subscribe themselves (named via [`ConsumerId`]) rather
|
||||
/// than receiving a pre-built [`FrameReceiver`] over the
|
||||
/// handle.
|
||||
pub fn publisher(&self) -> Arc<FramePublisher> {
|
||||
Arc::clone(&self.publisher)
|
||||
}
|
||||
|
||||
pub fn handle(&self) -> FrameIngestHandle {
|
||||
FrameIngestHandle {
|
||||
tx: self.tx.clone(),
|
||||
publisher: Arc::clone(&self.publisher),
|
||||
ai_lock_tx: self.ai_lock_tx.clone(),
|
||||
state_rx: self.state_tx.subscribe(),
|
||||
shutdown_tx: self.shutdown_tx.clone(),
|
||||
@@ -115,7 +145,7 @@ impl FrameIngest {
|
||||
T: RtspTransport + 'static,
|
||||
D: FrameDecoder + 'static,
|
||||
{
|
||||
let tx = self.tx.clone();
|
||||
let publisher = Arc::clone(&self.publisher);
|
||||
let ai_lock = self.ai_lock_tx.subscribe();
|
||||
let state_tx = self.state_tx.clone();
|
||||
let backend_tx = self.backend_tx.clone();
|
||||
@@ -135,7 +165,7 @@ impl FrameIngest {
|
||||
transport,
|
||||
decoder,
|
||||
config,
|
||||
tx,
|
||||
publisher,
|
||||
ai_lock,
|
||||
state_tx,
|
||||
shutdown_rx,
|
||||
@@ -158,7 +188,7 @@ async fn lifecycle_loop<T>(
|
||||
transport: Arc<Mutex<T>>,
|
||||
mut decoder: Box<dyn FrameDecoder + Send>,
|
||||
config: RtspSessionConfig,
|
||||
tx: broadcast::Sender<Frame>,
|
||||
publisher: Arc<FramePublisher>,
|
||||
mut ai_lock: watch::Receiver<bool>,
|
||||
state_tx: watch::Sender<SessionState>,
|
||||
mut shutdown_rx: watch::Receiver<bool>,
|
||||
@@ -250,12 +280,14 @@ async fn lifecycle_loop<T>(
|
||||
pix_fmt: dp.pix_fmt,
|
||||
ai_locked: locked,
|
||||
};
|
||||
// Send errors are no-ops when
|
||||
// the broadcast has no
|
||||
// subscribers; per-consumer
|
||||
// back-pressure is AZ-659's
|
||||
// problem.
|
||||
let _ = tx.send(frame);
|
||||
// The publisher folds lag
|
||||
// into per-consumer drop
|
||||
// counters; the lifecycle
|
||||
// loop never blocks on a
|
||||
// slow consumer. Return
|
||||
// value (subscriber count)
|
||||
// is informational.
|
||||
publisher.publish(frame);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -309,7 +341,7 @@ async fn lifecycle_loop<T>(
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FrameIngestHandle {
|
||||
tx: broadcast::Sender<Frame>,
|
||||
publisher: Arc<FramePublisher>,
|
||||
ai_lock_tx: watch::Sender<bool>,
|
||||
state_rx: watch::Receiver<SessionState>,
|
||||
shutdown_tx: watch::Sender<bool>,
|
||||
@@ -320,12 +352,47 @@ pub struct FrameIngestHandle {
|
||||
}
|
||||
|
||||
impl FrameIngestHandle {
|
||||
/// Subscribe to the frame stream. Consumers receive every frame
|
||||
/// after they subscribed; back-pressure is implemented via
|
||||
/// broadcast channel lag (see AZ-659 for the slow-consumer
|
||||
/// policy).
|
||||
/// Raw, unaccounted subscription. Used by legacy callers and
|
||||
/// tests that don't fit one of the three named [`ConsumerId`]
|
||||
/// roles. Lag on this receiver is *not* attributed to any
|
||||
/// per-consumer drop counter — prefer [`Self::subscribe_as`] for
|
||||
/// production consumers so the per-consumer drop dashboard
|
||||
/// stays accurate.
|
||||
pub fn subscribe(&self) -> broadcast::Receiver<Frame> {
|
||||
self.tx.subscribe()
|
||||
self.publisher.subscribe_raw()
|
||||
}
|
||||
|
||||
/// Subscribe under a named consumer identity. Per-consumer lag
|
||||
/// is folded into the matching drop counter and surfaced via
|
||||
/// [`Self::dropped_frames`]. The returned [`FrameReceiver`]
|
||||
/// transparently retries past lag so callers never observe
|
||||
/// `Lagged` — they only see the next available frame.
|
||||
pub fn subscribe_as(&self, consumer: ConsumerId) -> FrameReceiver {
|
||||
self.publisher.subscribe(consumer)
|
||||
}
|
||||
|
||||
/// Shared accessor for the underlying [`FramePublisher`]. Useful
|
||||
/// when a consumer needs to subscribe multiple times (e.g.
|
||||
/// reopening a receiver after a transient logical reset) without
|
||||
/// holding the full ingest handle.
|
||||
pub fn publisher(&self) -> Arc<FramePublisher> {
|
||||
Arc::clone(&self.publisher)
|
||||
}
|
||||
|
||||
/// Per-consumer drop counter. Increments by `n` every time the
|
||||
/// matching [`FrameReceiver`] would otherwise have surfaced
|
||||
/// `RecvError::Lagged(n)`.
|
||||
pub fn dropped_frames(&self, consumer: ConsumerId) -> u64 {
|
||||
self.publisher.stats().drops_for(consumer)
|
||||
}
|
||||
|
||||
/// Total publish attempts since the publisher was constructed.
|
||||
/// Increments on every decoded frame even when there are zero
|
||||
/// subscribers — the metric is the publish *rate*, not the
|
||||
/// delivered-frame rate. Use [`Self::dropped_frames`] for the
|
||||
/// delivered-vs-published delta per consumer.
|
||||
pub fn publishes_total(&self) -> u64 {
|
||||
self.publisher.stats().publishes_total()
|
||||
}
|
||||
|
||||
/// `bringCameraDown`/`bringCameraUp` per `description.md §2`. When
|
||||
@@ -467,4 +534,23 @@ mod tests {
|
||||
handle.set_ai_lock(false);
|
||||
assert!(!handle.ai_locked());
|
||||
}
|
||||
|
||||
/// A handle taken before `run` must already expose the publisher's
/// metrics, all at zero, and report the configured channel depth.
#[test]
fn handle_exposes_publisher_metrics_before_run() {
    // Arrange
    let ingest = FrameIngest::new(4);
    let handle = ingest.handle();

    // Assert — a fresh publisher reports zero for the publish total
    // and for every known consumer (the AZ-659 health surface
    // contract).
    assert_eq!(handle.publishes_total(), 0);
    for consumer in [
        ConsumerId::DetectionClient,
        ConsumerId::MovementDetector,
        ConsumerId::Telemetry,
    ] {
        assert_eq!(handle.dropped_frames(consumer), 0);
    }

    let reported_depth = handle.publisher().channel_depth();
    assert_eq!(
        reported_depth, 4,
        "channel_capacity from constructor must propagate to the publisher"
    );
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,263 @@
|
||||
//! AZ-659 — `FramePublisher` integration tests.
|
||||
//!
|
||||
//! These tests drive the publisher directly (no RTSP / decoder
|
||||
//! involved) so they execute in milliseconds and don't depend on
|
||||
//! libavcodec or NVDEC. The AZ-658 pipeline tests cover the
|
||||
//! lifecycle-loop integration end-to-end.
|
||||
//!
|
||||
//! ACs covered here:
|
||||
//! - AC-1 — three consumers consuming at-rate observe every frame and
|
||||
//! drop counters stay at 0.
|
||||
//! - AC-2 — a slow consumer's lag is folded into THAT consumer's
|
||||
//! drop counter while fast consumers continue to receive every
|
||||
//! frame.
|
||||
//! - AC-3 — zero-copy fan-out: every consumer receives the same
|
||||
//! `Arc<Bytes>` (asserted via `Arc::ptr_eq`) so memory does not
|
||||
//! scale with consumer count.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use frame_ingest::{ConsumerId, FramePublisher, DEFAULT_CHANNEL_DEPTH};
|
||||
use shared::models::frame::{Frame, PixelFormat};
|
||||
use tokio::time::{sleep, timeout};
|
||||
|
||||
fn make_frame(seq: u64, pixels: Arc<Bytes>) -> Frame {
|
||||
Frame {
|
||||
seq,
|
||||
capture_ts_monotonic_ns: seq * 1_000_000,
|
||||
decode_ts_monotonic_ns: seq * 1_000_000 + 100,
|
||||
pixels,
|
||||
width: 320,
|
||||
height: 240,
|
||||
pix_fmt: PixelFormat::Nv12,
|
||||
ai_locked: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// AC-1 — three consumers consuming as fast as the publisher emits
|
||||
/// observe every frame; per-consumer drop counters stay at 0. The
|
||||
/// spec quotes 30 fps for 10 s (~300 frames); we use 30 frames at
|
||||
/// no artificial delay to keep CI under 1 s. The semantic property
|
||||
/// — "consumers that keep up never lose a frame" — is identical.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn ac1_three_consumers_at_rate_lose_no_frames() {
|
||||
// Arrange
|
||||
let publisher = Arc::new(FramePublisher::new(DEFAULT_CHANNEL_DEPTH));
|
||||
let stats = publisher.stats();
|
||||
let mut det = publisher.subscribe(ConsumerId::DetectionClient);
|
||||
let mut mov = publisher.subscribe(ConsumerId::MovementDetector);
|
||||
let mut tel = publisher.subscribe(ConsumerId::Telemetry);
|
||||
|
||||
let total: u64 = 30;
|
||||
let publisher_for_task = Arc::clone(&publisher);
|
||||
|
||||
// Act — drain in parallel while publishing. Each consumer drains
|
||||
// immediately, so the broadcast channel stays well under
|
||||
// `DEFAULT_CHANNEL_DEPTH` and no consumer can lag.
|
||||
let producer = tokio::spawn(async move {
|
||||
let payload = Arc::new(Bytes::from(vec![0xAAu8; 256]));
|
||||
for seq in 0..total {
|
||||
publisher_for_task.publish(make_frame(seq, Arc::clone(&payload)));
|
||||
// Yield so subscribers get a chance to drain between
|
||||
// sends; without this the producer races ahead and any
|
||||
// delay in tokio scheduling could falsely trip the lag
|
||||
// counter even for a "fast" consumer at this small scale.
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
});
|
||||
|
||||
let drain = |mut rx: frame_ingest::FrameReceiver, label: &'static str| {
|
||||
tokio::spawn(async move {
|
||||
let mut got = 0u64;
|
||||
while got < total {
|
||||
match timeout(Duration::from_secs(2), rx.recv()).await {
|
||||
Ok(Ok(_)) => got += 1,
|
||||
Ok(Err(e)) => panic!("{label} recv closed early: {e}"),
|
||||
Err(_) => panic!("{label} stalled at {got}/{total}"),
|
||||
}
|
||||
}
|
||||
got
|
||||
})
|
||||
};
|
||||
|
||||
let h_det = drain(det.take(), "detection_client");
|
||||
let h_mov = drain(mov.take(), "movement_detector");
|
||||
let h_tel = drain(tel.take(), "telemetry");
|
||||
|
||||
producer.await.expect("producer");
|
||||
assert_eq!(h_det.await.expect("det join"), total);
|
||||
assert_eq!(h_mov.await.expect("mov join"), total);
|
||||
assert_eq!(h_tel.await.expect("tel join"), total);
|
||||
|
||||
// Assert — every consumer drained at-rate, so no drops on any
|
||||
// counter and `publishes_total` matches the produced count.
|
||||
assert_eq!(stats.publishes_total(), total);
|
||||
assert_eq!(stats.drops_for(ConsumerId::DetectionClient), 0);
|
||||
assert_eq!(stats.drops_for(ConsumerId::MovementDetector), 0);
|
||||
assert_eq!(stats.drops_for(ConsumerId::Telemetry), 0);
|
||||
}
|
||||
|
||||
/// AC-2 — a slow consumer (yields slowly) is the only one to incur
|
||||
/// drops; the fast consumers continue to observe every frame. The
|
||||
/// producer paces its sends at ~5 ms intervals so fast consumers
|
||||
/// can drain in between; the slow consumer sleeps ~25 ms per frame,
|
||||
/// so the broadcast channel laps it after a handful of frames.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn ac2_slow_consumer_drops_while_fast_consumers_unaffected() {
|
||||
// Arrange — depth-2 channel + a producer that paces sends.
|
||||
let channel_depth = 2usize;
|
||||
let publisher = Arc::new(FramePublisher::new(channel_depth));
|
||||
let stats = publisher.stats();
|
||||
|
||||
let mut det = publisher.subscribe(ConsumerId::DetectionClient); // fast
|
||||
let mut mov = publisher.subscribe(ConsumerId::MovementDetector); // fast
|
||||
let mut tel = publisher.subscribe(ConsumerId::Telemetry); // SLOW
|
||||
|
||||
let total: u64 = 30;
|
||||
let payload = Arc::new(Bytes::from(vec![0xBBu8; 64]));
|
||||
|
||||
// Spawn consumers BEFORE the producer task so the broadcast
|
||||
// already has live subscribers when the first publish lands.
|
||||
let slow = tokio::spawn(async move {
|
||||
let mut got = 0u64;
|
||||
let deadline = Duration::from_secs(10);
|
||||
let start = tokio::time::Instant::now();
|
||||
// The slow consumer keeps polling until the broadcast
|
||||
// channel closes (publisher drops) OR the safety deadline
|
||||
// fires. A `Closed` here is the natural termination signal
|
||||
// once the producer's `Arc<FramePublisher>` goes out of
|
||||
// scope; we don't try to predict how many frames it gets
|
||||
// because that depends on scheduling jitter.
|
||||
while start.elapsed() < deadline {
|
||||
match timeout(Duration::from_millis(500), tel.recv()).await {
|
||||
Ok(Ok(_)) => {
|
||||
got += 1;
|
||||
sleep(Duration::from_millis(25)).await;
|
||||
}
|
||||
Ok(Err(_)) => break, // Closed: producer finished.
|
||||
Err(_) => {
|
||||
// Timeout — assume producer is done and exit.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
got
|
||||
});
|
||||
|
||||
let drain_fast = |mut rx: frame_ingest::FrameReceiver, label: &'static str| {
|
||||
tokio::spawn(async move {
|
||||
let mut got = 0u64;
|
||||
while got < total {
|
||||
match timeout(Duration::from_secs(3), rx.recv()).await {
|
||||
Ok(Ok(_)) => got += 1,
|
||||
Ok(Err(e)) => panic!("{label} recv closed early: {e}"),
|
||||
Err(_) => panic!("{label} stalled at {got}/{total}"),
|
||||
}
|
||||
}
|
||||
got
|
||||
})
|
||||
};
|
||||
let h_det = drain_fast(det.take(), "detection_client");
|
||||
let h_mov = drain_fast(mov.take(), "movement_detector");
|
||||
|
||||
// Give consumers a moment to enter `recv` before producing.
|
||||
sleep(Duration::from_millis(10)).await;
|
||||
|
||||
// Act — pace sends ~5 ms apart so fast consumers have time to
|
||||
// drain each frame before the next arrives. The slow consumer
|
||||
// can only process ~1 frame per 25 ms, so it inevitably lags.
|
||||
let publisher_for_task = Arc::clone(&publisher);
|
||||
let payload_for_task = Arc::clone(&payload);
|
||||
let producer = tokio::spawn(async move {
|
||||
for seq in 0..total {
|
||||
publisher_for_task.publish(make_frame(seq, Arc::clone(&payload_for_task)));
|
||||
sleep(Duration::from_millis(5)).await;
|
||||
}
|
||||
});
|
||||
|
||||
producer.await.expect("producer");
|
||||
assert_eq!(h_det.await.expect("det join"), total);
|
||||
assert_eq!(h_mov.await.expect("mov join"), total);
|
||||
|
||||
// Drop the last `Arc<FramePublisher>` so the slow consumer's
|
||||
// recv returns `Closed` and it can exit on its own.
|
||||
drop(publisher);
|
||||
let slow_got = slow.await.expect("slow join");
|
||||
|
||||
// Assert — the slow consumer dropped frames; the fast ones did
|
||||
// not. The exact drop count varies with scheduler jitter so we
|
||||
// assert "> 0" rather than a specific number.
|
||||
assert_eq!(
|
||||
stats.drops_for(ConsumerId::DetectionClient),
|
||||
0,
|
||||
"fast consumer must not have any drops"
|
||||
);
|
||||
assert_eq!(
|
||||
stats.drops_for(ConsumerId::MovementDetector),
|
||||
0,
|
||||
"fast consumer must not have any drops"
|
||||
);
|
||||
let tel_drops = stats.drops_for(ConsumerId::Telemetry);
|
||||
assert!(
|
||||
tel_drops > 0,
|
||||
"slow telemetry consumer must have at least one drop; got {tel_drops}"
|
||||
);
|
||||
// Every frame is accounted for from the slow consumer's
|
||||
// perspective: delivered + dropped == published.
|
||||
assert_eq!(
|
||||
slow_got + tel_drops,
|
||||
stats.publishes_total(),
|
||||
"received + dropped must equal published for the slow consumer"
|
||||
);
|
||||
}
|
||||
|
||||
/// AC-3 — fan-out is zero-copy: each subscriber observes the SAME
|
||||
/// `Arc<Bytes>` for a given frame. Asserts the property via
|
||||
/// `Arc::ptr_eq` between the pixel handles delivered to two
|
||||
/// different consumers; the test does not depend on timing.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
async fn ac3_fan_out_is_zero_copy_via_arc_bytes() {
|
||||
// Arrange
|
||||
let publisher = Arc::new(FramePublisher::new(DEFAULT_CHANNEL_DEPTH));
|
||||
let mut det = publisher.subscribe(ConsumerId::DetectionClient);
|
||||
let mut mov = publisher.subscribe(ConsumerId::MovementDetector);
|
||||
let mut tel = publisher.subscribe(ConsumerId::Telemetry);
|
||||
let payload = Arc::new(Bytes::from(vec![0xCDu8; 1024]));
|
||||
|
||||
// Act
|
||||
publisher.publish(make_frame(42, Arc::clone(&payload)));
|
||||
let f_det = det.recv().await.expect("det recv");
|
||||
let f_mov = mov.recv().await.expect("mov recv");
|
||||
let f_tel = tel.recv().await.expect("tel recv");
|
||||
|
||||
// Assert — same Arc across consumers AND across publisher
|
||||
// boundary; the broadcast did not deep-clone Bytes anywhere.
|
||||
assert!(Arc::ptr_eq(&f_det.pixels, &payload));
|
||||
assert!(Arc::ptr_eq(&f_mov.pixels, &payload));
|
||||
assert!(Arc::ptr_eq(&f_tel.pixels, &payload));
|
||||
assert!(Arc::ptr_eq(&f_det.pixels, &f_mov.pixels));
|
||||
assert!(Arc::ptr_eq(&f_mov.pixels, &f_tel.pixels));
|
||||
}
|
||||
|
||||
// `FrameReceiver` does not implement `Copy` and the public surface
|
||||
// returns it by value, so we move it into the spawned task via
|
||||
// `take()` on a small helper. Defined here to keep test bodies tidy.
|
||||
trait Takeable {
|
||||
fn take(&mut self) -> frame_ingest::FrameReceiver;
|
||||
}
|
||||
|
||||
impl Takeable for frame_ingest::FrameReceiver {
|
||||
fn take(&mut self) -> frame_ingest::FrameReceiver {
|
||||
// SAFETY: we replace `self` with a fresh detached receiver
|
||||
// that the test no longer uses; this lets us move ownership
|
||||
// out of a `&mut`-bound binding without unsafe code.
|
||||
std::mem::replace(self, dummy_receiver())
|
||||
}
|
||||
}
|
||||
|
||||
fn dummy_receiver() -> frame_ingest::FrameReceiver {
|
||||
let p = FramePublisher::new(1);
|
||||
p.subscribe(ConsumerId::DetectionClient)
|
||||
}
|
||||
Reference in New Issue
Block a user