mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-22 01:51:10 +00:00
ff790bd639
ci/woodpecker/push/build-arm Pipeline failed
Pins operator-link transport to gRPC server-streaming (closes architecture Q2 in favour of gRPC). Adds first-time tonic / prost / tonic-build infrastructure to the workspace; uses protoc-bin-vendored so neither dev machines nor CI need system protoc installed. Design — back-pressure lives in the per-topic tokio::sync::broadcast ring, drained directly by the tonic-streamed response via BroadcastStream + StreamMap. No intermediate mpsc buffer that could absorb back-pressure invisibly. Slow client overrun -> Lagged(n) event -> per-(client_id, topic) drop counter incremented; healthy clients on the same topic are unaffected. Service surface — Subscribe(SubscribeRequest) -> stream TelemetryMessage; five topics (TelemetrySample, GimbalState, DetectionEvent, MovementCandidate, MapObjectsBundle); empty topics list defaults to subscribe-all; empty client_id rejected; stream drop decrements subscribed_clients via StreamGuard. TelemetrySink push_detections is now real; push_frame still NotImplemented(AZ-676 video path). Tests — 6 unit + 5 integration (AC-1..AC-3 via in-process gRPC client, plus subscribe-all default + empty-client_id rejection). Clippy on telemetry_stream clean. Pre-existing mission_executor ac3 test polling race surfaces more reliably under the new tonic build pressure; documented as _docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md and unchanged by this batch. Co-authored-by: Cursor <cursoragent@cursor.com>
315 lines
10 KiB
Rust
315 lines
10 KiB
Rust
//! AZ-675 — multi-topic, per-client lossy publisher.
//!
//! The publisher owns one `tokio::sync::broadcast` channel per topic.
//! Each subscribed client gets its own per-topic receivers. When a
//! client falls more than the channel capacity behind,
//! `tokio::sync::broadcast` returns `RecvError::Lagged(n)`, which the
//! server-side stream handler turns into a
//! `drops_total{client_id, topic} += n` increment. Slow clients never
//! block the publisher, and a slow client on one topic does not affect
//! any other client or topic.
|
|
|
|
use std::collections::HashMap;
|
|
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
|
use std::sync::Arc;
|
|
|
|
use parking_lot::Mutex;
|
|
use serde::Serialize;
|
|
use tokio::sync::broadcast;
|
|
use tracing::warn;
|
|
|
|
use crate::internal::proto::{TelemetryMessage, Topic};
|
|
|
|
/// Default number of messages each topic's broadcast channel retains.
///
/// A client that falls more than this many messages behind on a topic
/// starts losing messages on that topic. The figure was picked
/// empirically: at 10 Hz telemetry, 256 messages is roughly 25 seconds
/// of buffering — ample for a transient modem stall, while keeping
/// memory use bounded.
pub const DEFAULT_TOPIC_CAPACITY: usize = 256;
|
|
|
|
/// All known topics, in the same order as the proto enum (skipping
|
|
/// UNSPECIFIED).
|
|
pub const ALL_TOPICS: &[Topic] = &[
|
|
Topic::TelemetrySample,
|
|
Topic::GimbalState,
|
|
Topic::DetectionEvent,
|
|
Topic::MovementCandidate,
|
|
Topic::MapObjectsBundle,
|
|
];
|
|
|
|
/// Errors returned by [`TelemetryPublisher::publish`]. Publish never
|
|
/// blocks on slow clients; the only producer-side failure is
|
|
/// serialization of the payload, which is a programmer error caught
|
|
/// at compile time everywhere we control. We surface it explicitly
|
|
/// rather than swallow.
|
|
#[derive(Debug, thiserror::Error)]
|
|
pub enum PublishError {
|
|
#[error("serialize topic={topic:?}: {source}")]
|
|
Serialize {
|
|
topic: Topic,
|
|
#[source]
|
|
source: serde_json::Error,
|
|
},
|
|
}
|
|
|
|
/// Producer-side counters for a single topic, as reported in
/// [`PublisherSnapshot::per_topic`].
///
/// `Default` yields all-zero counters, and `PartialEq`/`Eq` let health
/// checks and tests compare snapshots directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PerTopicCounters {
    /// Number of messages published on the topic, counted whether or not
    /// any subscriber was attached at the time.
    pub published: u64,
    /// Cumulative size in bytes of the serialised JSON payloads published
    /// on the topic (counted once per publish, not once per subscriber).
    pub bytes_out: u64,
}
|
|
|
|
/// Snapshot of publisher health for the [`crate::TelemetryStreamHandle::health`]
|
|
/// surface and the AZ-675 NFR observability hooks.
|
|
#[derive(Debug, Clone, Default)]
|
|
pub struct PublisherSnapshot {
|
|
pub subscribed_clients: usize,
|
|
pub published_total: u64,
|
|
pub per_topic: HashMap<Topic, PerTopicCounters>,
|
|
pub drops_total: HashMap<(String, Topic), u64>,
|
|
}
|
|
|
|
struct TopicChannel {
|
|
tx: broadcast::Sender<TelemetryMessage>,
|
|
seq: AtomicU64,
|
|
published: AtomicU64,
|
|
bytes_out: AtomicU64,
|
|
}
|
|
|
|
impl TopicChannel {
|
|
fn new(capacity: usize) -> Self {
|
|
let (tx, _) = broadcast::channel(capacity);
|
|
Self {
|
|
tx,
|
|
seq: AtomicU64::new(0),
|
|
published: AtomicU64::new(0),
|
|
bytes_out: AtomicU64::new(0),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Drop counter map. Keyed by `(client_id, topic)`. We isolate it
|
|
/// behind a `parking_lot::Mutex` because writes happen from
|
|
/// per-client tonic stream tasks; reads happen from the health
|
|
/// surface — both are infrequent compared to the broadcast hot path.
|
|
type DropMap = Mutex<HashMap<(String, Topic), AtomicU64>>;
|
|
|
|
pub struct TelemetryPublisher {
|
|
topics: HashMap<Topic, TopicChannel>,
|
|
drops: DropMap,
|
|
subscribed_clients: AtomicUsize,
|
|
}
|
|
|
|
impl TelemetryPublisher {
|
|
/// Build a publisher with the same per-topic capacity for every
|
|
/// topic. Use [`new_with_capacities`] if a single topic needs a
|
|
/// different buffer (e.g. MapObjectsBundle which is bursty).
|
|
pub fn new(capacity: usize) -> Arc<Self> {
|
|
let mut topics = HashMap::with_capacity(ALL_TOPICS.len());
|
|
for &t in ALL_TOPICS {
|
|
topics.insert(t, TopicChannel::new(capacity));
|
|
}
|
|
Arc::new(Self {
|
|
topics,
|
|
drops: Mutex::new(HashMap::new()),
|
|
subscribed_clients: AtomicUsize::new(0),
|
|
})
|
|
}
|
|
|
|
pub fn default_capacity() -> Arc<Self> {
|
|
Self::new(DEFAULT_TOPIC_CAPACITY)
|
|
}
|
|
|
|
/// Serialise `payload` and fan out to every subscriber on `topic`.
|
|
///
|
|
/// Never blocks. `broadcast::Sender::send` returns the number of
|
|
/// receivers it queued for; a return of 0 means no current
|
|
/// subscribers, which is fine — we still bump the published
|
|
/// counter so the operator can confirm the producer is alive.
|
|
pub fn publish<T: Serialize>(&self, topic: Topic, payload: &T) -> Result<(), PublishError> {
|
|
let channel = match self.topics.get(&topic) {
|
|
Some(c) => c,
|
|
None => {
|
|
warn!(?topic, "unknown topic; dropping publish");
|
|
return Ok(());
|
|
}
|
|
};
|
|
|
|
let payload_json = serde_json::to_vec(payload)
|
|
.map_err(|e| PublishError::Serialize { topic, source: e })?;
|
|
let bytes_len = payload_json.len() as u64;
|
|
|
|
let seq = channel.seq.fetch_add(1, Ordering::Relaxed) + 1;
|
|
let msg = TelemetryMessage {
|
|
topic: topic as i32,
|
|
monotonic_ts_ns: shared::clock::MonoClock::new().elapsed_ns(),
|
|
sequence: seq,
|
|
payload_json,
|
|
};
|
|
|
|
let _ = channel.tx.send(msg);
|
|
channel.published.fetch_add(1, Ordering::Relaxed);
|
|
channel.bytes_out.fetch_add(bytes_len, Ordering::Relaxed);
|
|
Ok(())
|
|
}
|
|
|
|
/// Open a per-client receiver for `topic`. The caller (the
|
|
/// `Subscribe` RPC handler) is responsible for handling
|
|
/// `RecvError::Lagged` by calling [`record_drops`].
|
|
pub(crate) fn subscribe_topic(
|
|
&self,
|
|
topic: Topic,
|
|
) -> Option<broadcast::Receiver<TelemetryMessage>> {
|
|
self.topics.get(&topic).map(|c| c.tx.subscribe())
|
|
}
|
|
|
|
pub(crate) fn register_client(&self) {
|
|
self.subscribed_clients.fetch_add(1, Ordering::Relaxed);
|
|
}
|
|
|
|
pub(crate) fn deregister_client(&self) {
|
|
self.subscribed_clients.fetch_sub(1, Ordering::Relaxed);
|
|
}
|
|
|
|
pub fn record_drops(&self, client_id: &str, topic: Topic, n: u64) {
|
|
if n == 0 {
|
|
return;
|
|
}
|
|
let mut map = self.drops.lock();
|
|
let key = (client_id.to_string(), topic);
|
|
map.entry(key)
|
|
.or_insert_with(|| AtomicU64::new(0))
|
|
.fetch_add(n, Ordering::Relaxed);
|
|
}
|
|
|
|
pub fn snapshot(&self) -> PublisherSnapshot {
|
|
let mut per_topic = HashMap::with_capacity(self.topics.len());
|
|
let mut total_published = 0u64;
|
|
for (&t, c) in &self.topics {
|
|
let published = c.published.load(Ordering::Relaxed);
|
|
let bytes_out = c.bytes_out.load(Ordering::Relaxed);
|
|
total_published = total_published.saturating_add(published);
|
|
per_topic.insert(
|
|
t,
|
|
PerTopicCounters {
|
|
published,
|
|
bytes_out,
|
|
},
|
|
);
|
|
}
|
|
let drops_map = self.drops.lock();
|
|
let drops_total: HashMap<(String, Topic), u64> = drops_map
|
|
.iter()
|
|
.map(|(k, v)| (k.clone(), v.load(Ordering::Relaxed)))
|
|
.collect();
|
|
PublisherSnapshot {
|
|
subscribed_clients: self.subscribed_clients.load(Ordering::Relaxed),
|
|
published_total: total_published,
|
|
per_topic,
|
|
drops_total,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use serde::Deserialize;

    /// Minimal serialisable payload used by the publisher tests.
    #[derive(Serialize, Deserialize, Debug, PartialEq)]
    struct Sample {
        v: u32,
    }

    /// Publishing with nobody subscribed succeeds and is still counted.
    #[test]
    fn publish_with_no_subscribers_is_no_op() {
        // Arrange
        let publisher = TelemetryPublisher::new(8);

        // Act
        let outcome = publisher.publish(Topic::TelemetrySample, &Sample { v: 1 });
        outcome.unwrap();

        // Assert
        let health = publisher.snapshot();
        assert_eq!(health.subscribed_clients, 0);
        assert_eq!(health.per_topic[&Topic::TelemetrySample].published, 1);
    }

    /// A receiver that never drains lags; one that drains after every
    /// publish keeps up.
    #[tokio::test]
    async fn slow_subscriber_lags_fast_subscriber_does_not() {
        // Arrange — a capacity of 4 makes 100 publishes overflow easily.
        let publisher = TelemetryPublisher::new(4);
        let mut fast = publisher.subscribe_topic(Topic::TelemetrySample).unwrap();
        let mut slow = publisher.subscribe_topic(Topic::TelemetrySample).unwrap();

        // Act — publish 100 messages; `fast` reads each one straight away,
        // `slow` is left untouched until the burst is over.
        for v in 0..100u32 {
            publisher
                .publish(Topic::TelemetrySample, &Sample { v })
                .unwrap();
            let _received = fast.recv().await.unwrap();
        }

        // Empty `slow`, tallying skipped versus delivered messages.
        let (mut lagged, mut received) = (0u64, 0u64);
        loop {
            match slow.try_recv() {
                Err(broadcast::error::TryRecvError::Lagged(skipped)) => lagged += skipped,
                Err(_) => break,
                Ok(_) => received += 1,
            }
        }

        // Assert — `slow` kept at most one buffer's worth and lost the rest.
        assert!(received <= 4, "slow can hold at most capacity (4)");
        assert_eq!(lagged + received, 100, "every published msg accounted for");
        assert!(lagged > 0, "slow subscriber MUST have lagged");
    }

    /// Drop counts add up independently for each `(client_id, topic)` pair.
    #[test]
    fn record_drops_accumulates_per_client_topic() {
        // Arrange
        let publisher = TelemetryPublisher::new(8);

        // Act
        let recorded = [
            ("client_a", Topic::TelemetrySample, 5),
            ("client_a", Topic::TelemetrySample, 3),
            ("client_a", Topic::GimbalState, 2),
            ("client_b", Topic::TelemetrySample, 1),
        ];
        for (client, topic, n) in recorded {
            publisher.record_drops(client, topic, n);
        }

        // Assert
        let health = publisher.snapshot();
        let drops_for = |client: &str, topic: Topic| health.drops_total[&(client.to_string(), topic)];
        assert_eq!(drops_for("client_a", Topic::TelemetrySample), 8);
        assert_eq!(drops_for("client_a", Topic::GimbalState), 2);
        assert_eq!(drops_for("client_b", Topic::TelemetrySample), 1);
    }

    /// Register and deregister calls move the subscribed-client gauge up
    /// and down by one.
    #[test]
    fn register_deregister_balance_tracks_subscribed_clients() {
        // Arrange
        let publisher = TelemetryPublisher::new(8);
        let clients = || publisher.snapshot().subscribed_clients;
        assert_eq!(clients(), 0);

        // Act
        publisher.register_client();
        publisher.register_client();
        assert_eq!(clients(), 2);
        publisher.deregister_client();

        // Assert
        assert_eq!(clients(), 1);
    }
}
|