mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-22 17:41:10 +00:00
[AZ-675] telemetry_stream Tonic gRPC server + per-client lossy queue
ci/woodpecker/push/build-arm Pipeline failed
ci/woodpecker/push/build-arm Pipeline failed
Pins operator-link transport to gRPC server-streaming (closes architecture Q2 in favour of gRPC). Adds first-time tonic / prost / tonic-build infrastructure to the workspace; uses protoc-bin-vendored so neither dev machines nor CI need system protoc installed. Design — back-pressure lives in the per-topic tokio::sync::broadcast ring, drained directly by the tonic-streamed response via BroadcastStream + StreamMap. No intermediate mpsc buffer that could absorb back-pressure invisibly. Slow client overrun -> Lagged(n) event -> per-(client_id, topic) drop counter incremented; healthy clients on the same topic are unaffected. Service surface — Subscribe(SubscribeRequest) -> stream TelemetryMessage; five topics (TelemetrySample, GimbalState, DetectionEvent, MovementCandidate, MapObjectsBundle); empty topics list defaults to subscribe-all; empty client_id rejected; stream drop decrements subscribed_clients via StreamGuard. TelemetrySink push_detections is now real; push_frame still NotImplemented(AZ-676 video path). Tests — 6 unit + 5 integration (AC-1..AC-3 via in-process gRPC client, plus subscribe-all default + empty-client_id rejection). Clippy on telemetry_stream clean. Pre-existing mission_executor ac3 test polling race surfaces more reliably under the new tonic build pressure; documented as _docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md and unchanged by this batch. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -0,0 +1,314 @@
|
||||
//! AZ-675 — multi-topic, per-client lossy publisher.
|
||||
//!
|
||||
//! The publisher owns one `tokio::sync::broadcast` channel per topic.
|
||||
//! Each subscribed client gets per-topic receivers; falling behind
|
||||
//! more than the channel capacity causes `tokio::sync::broadcast` to
|
||||
//! return `RecvError::Lagged(n)` which the server-side stream
|
||||
//! handler turns into a `drops_total{client_id, topic} += n`
|
||||
//! increment. Slow clients never block the publisher, and a slow
|
||||
//! client on one topic does not affect any other client or topic.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use parking_lot::Mutex;
|
||||
use serde::Serialize;
|
||||
use tokio::sync::broadcast;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::internal::proto::{TelemetryMessage, Topic};
|
||||
|
||||
/// Default size, in messages, of each per-topic broadcast channel.
///
/// A client that falls more than this many messages behind on a topic
/// experiences drops on that topic. The value was picked empirically: at a
/// 10 Hz telemetry rate, 256 messages is roughly 25 seconds of buffering —
/// more than enough for a transient modem stall, yet bounded for memory.
pub const DEFAULT_TOPIC_CAPACITY: usize = 256;
|
||||
|
||||
/// All known topics, in the same order as the proto enum (skipping
|
||||
/// UNSPECIFIED).
|
||||
pub const ALL_TOPICS: &[Topic] = &[
|
||||
Topic::TelemetrySample,
|
||||
Topic::GimbalState,
|
||||
Topic::DetectionEvent,
|
||||
Topic::MovementCandidate,
|
||||
Topic::MapObjectsBundle,
|
||||
];
|
||||
|
||||
/// Errors returned by [`TelemetryPublisher::publish`]. Publish never
|
||||
/// blocks on slow clients; the only producer-side failure is
|
||||
/// serialization of the payload, which is a programmer error caught
|
||||
/// at compile time everywhere we control. We surface it explicitly
|
||||
/// rather than swallow.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum PublishError {
|
||||
#[error("serialize topic={topic:?}: {source}")]
|
||||
Serialize {
|
||||
topic: Topic,
|
||||
#[source]
|
||||
source: serde_json::Error,
|
||||
},
|
||||
}
|
||||
|
||||
/// Point-in-time publish counters for a single topic.
///
/// `Default` yields zeroed counters, and `PartialEq`/`Eq` let callers and
/// tests compare counter values directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PerTopicCounters {
    /// Number of messages published on the topic.
    pub published: u64,
    /// Total size, in bytes, of the serialized payloads published on the topic.
    pub bytes_out: u64,
}
|
||||
|
||||
/// Snapshot of publisher health for the [`crate::TelemetryStreamHandle::health`]
|
||||
/// surface and the AZ-675 NFR observability hooks.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PublisherSnapshot {
|
||||
pub subscribed_clients: usize,
|
||||
pub published_total: u64,
|
||||
pub per_topic: HashMap<Topic, PerTopicCounters>,
|
||||
pub drops_total: HashMap<(String, Topic), u64>,
|
||||
}
|
||||
|
||||
struct TopicChannel {
|
||||
tx: broadcast::Sender<TelemetryMessage>,
|
||||
seq: AtomicU64,
|
||||
published: AtomicU64,
|
||||
bytes_out: AtomicU64,
|
||||
}
|
||||
|
||||
impl TopicChannel {
|
||||
fn new(capacity: usize) -> Self {
|
||||
let (tx, _) = broadcast::channel(capacity);
|
||||
Self {
|
||||
tx,
|
||||
seq: AtomicU64::new(0),
|
||||
published: AtomicU64::new(0),
|
||||
bytes_out: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop counter map. Keyed by `(client_id, topic)`. We isolate it
|
||||
/// behind a `parking_lot::Mutex` because writes happen from
|
||||
/// per-client tonic stream tasks; reads happen from the health
|
||||
/// surface — both are infrequent compared to the broadcast hot path.
|
||||
type DropMap = Mutex<HashMap<(String, Topic), AtomicU64>>;
|
||||
|
||||
pub struct TelemetryPublisher {
|
||||
topics: HashMap<Topic, TopicChannel>,
|
||||
drops: DropMap,
|
||||
subscribed_clients: AtomicUsize,
|
||||
}
|
||||
|
||||
impl TelemetryPublisher {
|
||||
/// Build a publisher with the same per-topic capacity for every
|
||||
/// topic. Use [`new_with_capacities`] if a single topic needs a
|
||||
/// different buffer (e.g. MapObjectsBundle which is bursty).
|
||||
pub fn new(capacity: usize) -> Arc<Self> {
|
||||
let mut topics = HashMap::with_capacity(ALL_TOPICS.len());
|
||||
for &t in ALL_TOPICS {
|
||||
topics.insert(t, TopicChannel::new(capacity));
|
||||
}
|
||||
Arc::new(Self {
|
||||
topics,
|
||||
drops: Mutex::new(HashMap::new()),
|
||||
subscribed_clients: AtomicUsize::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn default_capacity() -> Arc<Self> {
|
||||
Self::new(DEFAULT_TOPIC_CAPACITY)
|
||||
}
|
||||
|
||||
/// Serialise `payload` and fan out to every subscriber on `topic`.
|
||||
///
|
||||
/// Never blocks. `broadcast::Sender::send` returns the number of
|
||||
/// receivers it queued for; a return of 0 means no current
|
||||
/// subscribers, which is fine — we still bump the published
|
||||
/// counter so the operator can confirm the producer is alive.
|
||||
pub fn publish<T: Serialize>(&self, topic: Topic, payload: &T) -> Result<(), PublishError> {
|
||||
let channel = match self.topics.get(&topic) {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
warn!(?topic, "unknown topic; dropping publish");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let payload_json = serde_json::to_vec(payload)
|
||||
.map_err(|e| PublishError::Serialize { topic, source: e })?;
|
||||
let bytes_len = payload_json.len() as u64;
|
||||
|
||||
let seq = channel.seq.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
let msg = TelemetryMessage {
|
||||
topic: topic as i32,
|
||||
monotonic_ts_ns: shared::clock::MonoClock::new().elapsed_ns(),
|
||||
sequence: seq,
|
||||
payload_json,
|
||||
};
|
||||
|
||||
let _ = channel.tx.send(msg);
|
||||
channel.published.fetch_add(1, Ordering::Relaxed);
|
||||
channel.bytes_out.fetch_add(bytes_len, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Open a per-client receiver for `topic`. The caller (the
|
||||
/// `Subscribe` RPC handler) is responsible for handling
|
||||
/// `RecvError::Lagged` by calling [`record_drops`].
|
||||
pub(crate) fn subscribe_topic(
|
||||
&self,
|
||||
topic: Topic,
|
||||
) -> Option<broadcast::Receiver<TelemetryMessage>> {
|
||||
self.topics.get(&topic).map(|c| c.tx.subscribe())
|
||||
}
|
||||
|
||||
pub(crate) fn register_client(&self) {
|
||||
self.subscribed_clients.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub(crate) fn deregister_client(&self) {
|
||||
self.subscribed_clients.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn record_drops(&self, client_id: &str, topic: Topic, n: u64) {
|
||||
if n == 0 {
|
||||
return;
|
||||
}
|
||||
let mut map = self.drops.lock();
|
||||
let key = (client_id.to_string(), topic);
|
||||
map.entry(key)
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(n, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn snapshot(&self) -> PublisherSnapshot {
|
||||
let mut per_topic = HashMap::with_capacity(self.topics.len());
|
||||
let mut total_published = 0u64;
|
||||
for (&t, c) in &self.topics {
|
||||
let published = c.published.load(Ordering::Relaxed);
|
||||
let bytes_out = c.bytes_out.load(Ordering::Relaxed);
|
||||
total_published = total_published.saturating_add(published);
|
||||
per_topic.insert(
|
||||
t,
|
||||
PerTopicCounters {
|
||||
published,
|
||||
bytes_out,
|
||||
},
|
||||
);
|
||||
}
|
||||
let drops_map = self.drops.lock();
|
||||
let drops_total: HashMap<(String, Topic), u64> = drops_map
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.load(Ordering::Relaxed)))
|
||||
.collect();
|
||||
PublisherSnapshot {
|
||||
subscribed_clients: self.subscribed_clients.load(Ordering::Relaxed),
|
||||
published_total: total_published,
|
||||
per_topic,
|
||||
drops_total,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde::Deserialize;

    /// Minimal serializable payload used by the publisher tests.
    #[derive(Serialize, Deserialize, Debug, PartialEq)]
    struct Sample {
        v: u32,
    }

    /// Publishing with nobody subscribed still succeeds and is counted.
    #[test]
    fn publish_with_no_subscribers_is_no_op() {
        // Arrange
        let publisher = TelemetryPublisher::new(8);

        // Act
        publisher
            .publish(Topic::TelemetrySample, &Sample { v: 1 })
            .unwrap();

        // Assert
        let snapshot = publisher.snapshot();
        assert_eq!(snapshot.subscribed_clients, 0);
        assert_eq!(snapshot.per_topic[&Topic::TelemetrySample].published, 1);
    }

    /// A receiver that never drains lags; one that drains each message does not.
    #[tokio::test]
    async fn slow_subscriber_lags_fast_subscriber_does_not() {
        // Arrange — capacity 4 so 100 publishes overflow trivially.
        let publisher = TelemetryPublisher::new(4);
        let mut fast = publisher.subscribe_topic(Topic::TelemetrySample).unwrap();
        let mut slow = publisher.subscribe_topic(Topic::TelemetrySample).unwrap();

        // Act — burst 100 messages WITHOUT giving `slow` a chance to drain.
        for v in 0..100u32 {
            publisher
                .publish(Topic::TelemetrySample, &Sample { v })
                .unwrap();
            // `fast` drains immediately (the unwrap fails if it ever lags);
            // `slow` does nothing.
            let _ = fast.recv().await.unwrap();
        }

        // Drain `slow`, tallying delivered messages and reported lag.
        let (mut total_drops, mut delivered) = (0u64, 0u64);
        loop {
            match slow.try_recv() {
                Ok(_) => delivered += 1,
                Err(broadcast::error::TryRecvError::Lagged(skipped)) => total_drops += skipped,
                Err(_) => break,
            }
        }

        // Assert — slow lost most messages but delivered ≤ capacity.
        assert!(delivered <= 4, "slow can hold at most capacity (4)");
        assert_eq!(
            total_drops + delivered,
            100,
            "every published msg accounted for"
        );
        assert!(total_drops > 0, "slow subscriber MUST have lagged");
    }

    /// Drop counts accumulate independently per `(client_id, topic)` pair.
    #[test]
    fn record_drops_accumulates_per_client_topic() {
        // Arrange
        let publisher = TelemetryPublisher::new(8);

        // Act
        publisher.record_drops("client_a", Topic::TelemetrySample, 5);
        publisher.record_drops("client_a", Topic::TelemetrySample, 3);
        publisher.record_drops("client_a", Topic::GimbalState, 2);
        publisher.record_drops("client_b", Topic::TelemetrySample, 1);

        // Assert
        let snapshot = publisher.snapshot();
        assert_eq!(
            snapshot.drops_total[&("client_a".to_string(), Topic::TelemetrySample)],
            8
        );
        assert_eq!(
            snapshot.drops_total[&("client_a".to_string(), Topic::GimbalState)],
            2
        );
        assert_eq!(
            snapshot.drops_total[&("client_b".to_string(), Topic::TelemetrySample)],
            1
        );
    }

    /// Register and deregister calls are reflected in the snapshot count.
    #[test]
    fn register_deregister_balance_tracks_subscribed_clients() {
        // Arrange
        let publisher = TelemetryPublisher::new(8);
        assert_eq!(publisher.snapshot().subscribed_clients, 0);

        // Act
        publisher.register_client();
        publisher.register_client();
        assert_eq!(publisher.snapshot().subscribed_clients, 2);
        publisher.deregister_client();

        // Assert
        assert_eq!(publisher.snapshot().subscribed_clients, 1);
    }
}
|
||||
Reference in New Issue
Block a user