//! AZ-675 — multi-topic, per-client lossy publisher. //! //! The publisher owns one `tokio::sync::broadcast` channel per topic. //! Each subscribed client gets per-topic receivers; falling behind //! more than the channel capacity causes `tokio::sync::broadcast` to //! return `RecvError::Lagged(n)` which the server-side stream //! handler turns into a `drops_total{client_id, topic} += n` //! increment. Slow clients never block the publisher, and a slow //! client on one topic does not affect any other client or topic. use std::collections::HashMap; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; use parking_lot::Mutex; use serde::Serialize; use tokio::sync::broadcast; use tracing::warn; use crate::internal::proto::{TelemetryMessage, Topic}; /// Per-topic broadcast capacity. A client falling more than this many /// messages behind on a topic experiences drops on that topic. The /// value is chosen empirically: 10 Hz telemetry → 256 messages ≈ 25 /// seconds of buffering, which is more than enough for transient /// modem stalls but bounded for memory. pub const DEFAULT_TOPIC_CAPACITY: usize = 256; /// All known topics, in the same order as the proto enum (skipping /// UNSPECIFIED). pub const ALL_TOPICS: &[Topic] = &[ Topic::TelemetrySample, Topic::GimbalState, Topic::DetectionEvent, Topic::MovementCandidate, Topic::MapObjectsBundle, ]; /// Errors returned by [`TelemetryPublisher::publish`]. Publish never /// blocks on slow clients; the only producer-side failure is /// serialization of the payload, which is a programmer error caught /// at compile time everywhere we control. We surface it explicitly /// rather than swallow. #[derive(Debug, thiserror::Error)] pub enum PublishError { #[error("serialize topic={topic:?}: {source}")] Serialize { topic: Topic, #[source] source: serde_json::Error, }, } #[derive(Debug, Clone)] pub struct PerTopicCounters { pub published: u64, pub bytes_out: u64, } /// Snapshot of publisher health for the [`crate::TelemetryStreamHandle::health`] /// surface and the AZ-675 NFR observability hooks. #[derive(Debug, Clone, Default)] pub struct PublisherSnapshot { pub subscribed_clients: usize, pub published_total: u64, pub per_topic: HashMap, pub drops_total: HashMap<(String, Topic), u64>, } struct TopicChannel { tx: broadcast::Sender, seq: AtomicU64, published: AtomicU64, bytes_out: AtomicU64, } impl TopicChannel { fn new(capacity: usize) -> Self { let (tx, _) = broadcast::channel(capacity); Self { tx, seq: AtomicU64::new(0), published: AtomicU64::new(0), bytes_out: AtomicU64::new(0), } } } /// Drop counter map. Keyed by `(client_id, topic)`. We isolate it /// behind a `parking_lot::Mutex` because writes happen from /// per-client tonic stream tasks; reads happen from the health /// surface — both are infrequent compared to the broadcast hot path. type DropMap = Mutex>; pub struct TelemetryPublisher { topics: HashMap, drops: DropMap, subscribed_clients: AtomicUsize, } impl TelemetryPublisher { /// Build a publisher with the same per-topic capacity for every /// topic. Use [`new_with_capacities`] if a single topic needs a /// different buffer (e.g. MapObjectsBundle which is bursty). pub fn new(capacity: usize) -> Arc { let mut topics = HashMap::with_capacity(ALL_TOPICS.len()); for &t in ALL_TOPICS { topics.insert(t, TopicChannel::new(capacity)); } Arc::new(Self { topics, drops: Mutex::new(HashMap::new()), subscribed_clients: AtomicUsize::new(0), }) } pub fn default_capacity() -> Arc { Self::new(DEFAULT_TOPIC_CAPACITY) } /// Serialise `payload` and fan out to every subscriber on `topic`. /// /// Never blocks. `broadcast::Sender::send` returns the number of /// receivers it queued for; a return of 0 means no current /// subscribers, which is fine — we still bump the published /// counter so the operator can confirm the producer is alive. pub fn publish(&self, topic: Topic, payload: &T) -> Result<(), PublishError> { let channel = match self.topics.get(&topic) { Some(c) => c, None => { warn!(?topic, "unknown topic; dropping publish"); return Ok(()); } }; let payload_json = serde_json::to_vec(payload) .map_err(|e| PublishError::Serialize { topic, source: e })?; let bytes_len = payload_json.len() as u64; let seq = channel.seq.fetch_add(1, Ordering::Relaxed) + 1; let msg = TelemetryMessage { topic: topic as i32, monotonic_ts_ns: shared::clock::MonoClock::new().elapsed_ns(), sequence: seq, payload_json, }; let _ = channel.tx.send(msg); channel.published.fetch_add(1, Ordering::Relaxed); channel.bytes_out.fetch_add(bytes_len, Ordering::Relaxed); Ok(()) } /// Open a per-client receiver for `topic`. The caller (the /// `Subscribe` RPC handler) is responsible for handling /// `RecvError::Lagged` by calling [`record_drops`]. pub(crate) fn subscribe_topic( &self, topic: Topic, ) -> Option> { self.topics.get(&topic).map(|c| c.tx.subscribe()) } pub(crate) fn register_client(&self) { self.subscribed_clients.fetch_add(1, Ordering::Relaxed); } pub(crate) fn deregister_client(&self) { self.subscribed_clients.fetch_sub(1, Ordering::Relaxed); } pub fn record_drops(&self, client_id: &str, topic: Topic, n: u64) { if n == 0 { return; } let mut map = self.drops.lock(); let key = (client_id.to_string(), topic); map.entry(key) .or_insert_with(|| AtomicU64::new(0)) .fetch_add(n, Ordering::Relaxed); } pub fn snapshot(&self) -> PublisherSnapshot { let mut per_topic = HashMap::with_capacity(self.topics.len()); let mut total_published = 0u64; for (&t, c) in &self.topics { let published = c.published.load(Ordering::Relaxed); let bytes_out = c.bytes_out.load(Ordering::Relaxed); total_published = total_published.saturating_add(published); per_topic.insert( t, PerTopicCounters { published, bytes_out, }, ); } let drops_map = self.drops.lock(); let drops_total: HashMap<(String, Topic), u64> = drops_map .iter() .map(|(k, v)| (k.clone(), v.load(Ordering::Relaxed))) .collect(); PublisherSnapshot { subscribed_clients: self.subscribed_clients.load(Ordering::Relaxed), published_total: total_published, per_topic, drops_total, } } } #[cfg(test)] mod tests { use super::*; use serde::Deserialize; #[derive(Serialize, Deserialize, Debug, PartialEq)] struct Sample { v: u32, } #[test] fn publish_with_no_subscribers_is_no_op() { // Arrange let pub_ = TelemetryPublisher::new(8); // Act pub_.publish(Topic::TelemetrySample, &Sample { v: 1 }) .unwrap(); // Assert let snap = pub_.snapshot(); assert_eq!(snap.subscribed_clients, 0); assert_eq!(snap.per_topic[&Topic::TelemetrySample].published, 1); } #[tokio::test] async fn slow_subscriber_lags_fast_subscriber_does_not() { // Arrange — capacity 4 so 100 publishes overflow trivially. let pub_ = TelemetryPublisher::new(4); let mut fast = pub_.subscribe_topic(Topic::TelemetrySample).unwrap(); let mut slow = pub_.subscribe_topic(Topic::TelemetrySample).unwrap(); // Act — burst 100 messages WITHOUT giving `slow` a chance to drain. for v in 0..100u32 { pub_.publish(Topic::TelemetrySample, &Sample { v }).unwrap(); // fast drains immediately; slow does nothing. let _ = fast.recv().await.unwrap(); } // Drain `slow` and count lag. let mut total_drops: u64 = 0; let mut delivered: u64 = 0; loop { match slow.try_recv() { Ok(_) => delivered += 1, Err(broadcast::error::TryRecvError::Lagged(n)) => total_drops += n, Err(_) => break, } } // Assert — fast got all 100; slow lost most but delivered ≤ capacity. assert!(delivered <= 4, "slow can hold at most capacity (4)"); assert_eq!( total_drops + delivered, 100, "every published msg accounted for" ); assert!(total_drops > 0, "slow subscriber MUST have lagged"); } #[test] fn record_drops_accumulates_per_client_topic() { // Arrange let pub_ = TelemetryPublisher::new(8); // Act pub_.record_drops("client_a", Topic::TelemetrySample, 5); pub_.record_drops("client_a", Topic::TelemetrySample, 3); pub_.record_drops("client_a", Topic::GimbalState, 2); pub_.record_drops("client_b", Topic::TelemetrySample, 1); // Assert let snap = pub_.snapshot(); assert_eq!( snap.drops_total[&("client_a".to_string(), Topic::TelemetrySample)], 8 ); assert_eq!( snap.drops_total[&("client_a".to_string(), Topic::GimbalState)], 2 ); assert_eq!( snap.drops_total[&("client_b".to_string(), Topic::TelemetrySample)], 1 ); } #[test] fn register_deregister_balance_tracks_subscribed_clients() { // Arrange let pub_ = TelemetryPublisher::new(8); assert_eq!(pub_.snapshot().subscribed_clients, 0); // Act pub_.register_client(); pub_.register_client(); assert_eq!(pub_.snapshot().subscribed_clients, 2); pub_.deregister_client(); // Assert assert_eq!(pub_.snapshot().subscribed_clients, 1); } }