mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-21 10:31:10 +00:00
[AZ-675] telemetry_stream Tonic gRPC server + per-client lossy queue
ci/woodpecker/push/build-arm Pipeline failed
ci/woodpecker/push/build-arm Pipeline failed
Pins operator-link transport to gRPC server-streaming (closes architecture Q2 in favour of gRPC). Adds first-time tonic / prost / tonic-build infrastructure to the workspace; uses protoc-bin-vendored so neither dev machines nor CI need system protoc installed. Design — back-pressure lives in the per-topic tokio::sync::broadcast ring, drained directly by the tonic-streamed response via BroadcastStream + StreamMap. No intermediate mpsc buffer that could absorb back-pressure invisibly. Slow client overrun -> Lagged(n) event -> per-(client_id, topic) drop counter incremented; healthy clients on the same topic are unaffected. Service surface — Subscribe(SubscribeRequest) -> stream TelemetryMessage; five topics (TelemetrySample, GimbalState, DetectionEvent, MovementCandidate, MapObjectsBundle); empty topics list defaults to subscribe-all; empty client_id rejected; stream drop decrements subscribed_clients via StreamGuard. TelemetrySink push_detections is now real; push_frame still NotImplemented(AZ-676 video path). Tests — 6 unit + 5 integration (AC-1..AC-3 via in-process gRPC client, plus subscribe-all default + empty-client_id rejection). Clippy on telemetry_stream clean. Pre-existing mission_executor ac3 test polling race surfaces more reliably under the new tonic build pressure; documented as _docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md and unchanged by this batch. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -6,9 +6,27 @@ rust-version.workspace = true
|
||||
license.workspace = true
|
||||
publish.workspace = true
|
||||
authors.workspace = true
|
||||
build = "build.rs"
|
||||
|
||||
[dependencies]
|
||||
shared = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
tokio-stream = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde_json = { workspace = true }
|
||||
prost = { workspace = true }
|
||||
tonic = { workspace = true }
|
||||
tonic-prost = { workspace = true }
|
||||
parking_lot = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-prost-build = { workspace = true }
|
||||
protoc-bin-vendored = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
//! Build-time codegen for the AZ-675 telemetry gRPC contract.
|
||||
//!
|
||||
//! We use `protoc-bin-vendored` so the build is self-contained — no
|
||||
//! system protoc install is required on dev or CI. The PROTOC env
|
||||
//! var is set before invoking `tonic-prost-build`, which is what
|
||||
//! tonic uses to locate the compiler.
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let protoc = protoc_bin_vendored::protoc_bin_path()?;
|
||||
std::env::set_var("PROTOC", protoc);
|
||||
|
||||
tonic_prost_build::configure()
|
||||
.build_client(true)
|
||||
.build_server(true)
|
||||
.compile_protos(&["proto/telemetry.proto"], &["proto"])?;
|
||||
|
||||
println!("cargo:rerun-if-changed=proto/telemetry.proto");
|
||||
Ok(())
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
// AZ-675 telemetry_stream — operator-bound gRPC contract.
|
||||
//
|
||||
// One service, one bi-directional Subscribe RPC. Client opens a stream
|
||||
// declaring which topics it wants; server pushes messages for those
|
||||
// topics until the client disconnects.
|
||||
//
|
||||
// The server enforces per-client back-pressure: when a client cannot
|
||||
// keep up the oldest message in *that client's* queue is dropped and
|
||||
// a per-(client, topic) drop counter is incremented. Other clients
|
||||
// are unaffected.
|
||||
//
|
||||
// AZ-676 will add the video path (separate RPC, server-streamed binary
|
||||
// frames). AZ-677 will add the MapObjectsBundle snapshot RPC. Keep
|
||||
// those concerns out of this contract.
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package autopilot.telemetry.v1;
|
||||
|
||||
// Topics a client can subscribe to. Repeated in SubscribeRequest so a
|
||||
// single stream multiplexes whichever subset the client cares about.
|
||||
enum Topic {
|
||||
TOPIC_UNSPECIFIED = 0;
|
||||
TOPIC_TELEMETRY_SAMPLE = 1;
|
||||
TOPIC_GIMBAL_STATE = 2;
|
||||
TOPIC_DETECTION_EVENT = 3;
|
||||
TOPIC_MOVEMENT_CANDIDATE = 4;
|
||||
TOPIC_MAP_OBJECTS_BUNDLE = 5;
|
||||
}
|
||||
|
||||
message SubscribeRequest {
|
||||
// Operator/client identifier. Plumbed into per-client drop counters
|
||||
// and log lines. Free-form for AZ-675; AZ-678 (operator command
|
||||
// auth) will tighten the format.
|
||||
string client_id = 1;
|
||||
repeated Topic topics = 2;
|
||||
}
|
||||
|
||||
// Each message carries the topic tag + an opaque JSON payload. We
|
||||
// don't pin schemas per topic in proto here because the canonical
|
||||
// payload shapes are already authoritative in `crates/shared/models`
|
||||
// (TelemetrySample, GimbalState, DetectionEvent, MovementCandidate,
|
||||
// MapObjectsBundle). Using JSON keeps the wire honest with what the
|
||||
// rest of the system already serializes; if a topic later needs
|
||||
// schema enforcement, add a typed message variant in a future bump.
|
||||
message TelemetryMessage {
|
||||
Topic topic = 1;
|
||||
// Monotonic nanoseconds since the autopilot process started. Used
|
||||
// by the operator for ordering and latency measurement.
|
||||
uint64 monotonic_ts_ns = 2;
|
||||
// Server-side per-client sequence number. Strictly increases per
|
||||
// (client, topic) stream; gaps imply drops.
|
||||
uint64 sequence = 3;
|
||||
// Serialized JSON payload (utf-8). Topic determines the shape.
|
||||
bytes payload_json = 4;
|
||||
}
|
||||
|
||||
service TelemetryStream {
|
||||
// Server-streaming subscribe. The client sends ONE SubscribeRequest;
|
||||
// the server pushes TelemetryMessage values until the client cancels
|
||||
// the stream or the server shuts down. The server applies per-
|
||||
// client drop-oldest back-pressure if the client cannot keep up.
|
||||
rpc Subscribe(SubscribeRequest) returns (stream TelemetryMessage);
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
//! Internal modules for `telemetry_stream`. Not part of the public API.
|
||||
|
||||
pub mod proto;
|
||||
pub mod publisher;
|
||||
pub mod server;
|
||||
@@ -0,0 +1,10 @@
|
||||
//! Generated tonic+prost code for the telemetry gRPC contract.
|
||||
//!
|
||||
//! The actual `.rs` file is produced at build time by `build.rs`
|
||||
//! (see workspace `tonic-prost-build` / `protoc-bin-vendored` deps)
|
||||
//! and dropped into `OUT_DIR`. We pull it in here under a stable
|
||||
//! module path so the rest of the crate doesn't reach into `OUT_DIR`.
|
||||
|
||||
#![allow(clippy::derive_partial_eq_without_eq)]
|
||||
|
||||
tonic::include_proto!("autopilot.telemetry.v1");
|
||||
@@ -0,0 +1,314 @@
|
||||
//! AZ-675 — multi-topic, per-client lossy publisher.
|
||||
//!
|
||||
//! The publisher owns one `tokio::sync::broadcast` channel per topic.
|
||||
//! Each subscribed client gets per-topic receivers; falling behind
|
||||
//! more than the channel capacity causes `tokio::sync::broadcast` to
|
||||
//! return `RecvError::Lagged(n)` which the server-side stream
|
||||
//! handler turns into a `drops_total{client_id, topic} += n`
|
||||
//! increment. Slow clients never block the publisher, and a slow
|
||||
//! client on one topic does not affect any other client or topic.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use parking_lot::Mutex;
|
||||
use serde::Serialize;
|
||||
use tokio::sync::broadcast;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::internal::proto::{TelemetryMessage, Topic};
|
||||
|
||||
/// Per-topic broadcast capacity. A client falling more than this many
|
||||
/// messages behind on a topic experiences drops on that topic. The
|
||||
/// value is chosen empirically: 10 Hz telemetry → 256 messages ≈ 25
|
||||
/// seconds of buffering, which is more than enough for transient
|
||||
/// modem stalls but bounded for memory.
|
||||
pub const DEFAULT_TOPIC_CAPACITY: usize = 256;
|
||||
|
||||
/// All known topics, in the same order as the proto enum (skipping
|
||||
/// UNSPECIFIED).
|
||||
pub const ALL_TOPICS: &[Topic] = &[
|
||||
Topic::TelemetrySample,
|
||||
Topic::GimbalState,
|
||||
Topic::DetectionEvent,
|
||||
Topic::MovementCandidate,
|
||||
Topic::MapObjectsBundle,
|
||||
];
|
||||
|
||||
/// Errors returned by [`TelemetryPublisher::publish`]. Publish never
|
||||
/// blocks on slow clients; the only producer-side failure is
|
||||
/// serialization of the payload, which is a programmer error caught
|
||||
/// at compile time everywhere we control. We surface it explicitly
|
||||
/// rather than swallow.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum PublishError {
|
||||
#[error("serialize topic={topic:?}: {source}")]
|
||||
Serialize {
|
||||
topic: Topic,
|
||||
#[source]
|
||||
source: serde_json::Error,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PerTopicCounters {
|
||||
pub published: u64,
|
||||
pub bytes_out: u64,
|
||||
}
|
||||
|
||||
/// Snapshot of publisher health for the [`crate::TelemetryStreamHandle::health`]
|
||||
/// surface and the AZ-675 NFR observability hooks.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct PublisherSnapshot {
|
||||
pub subscribed_clients: usize,
|
||||
pub published_total: u64,
|
||||
pub per_topic: HashMap<Topic, PerTopicCounters>,
|
||||
pub drops_total: HashMap<(String, Topic), u64>,
|
||||
}
|
||||
|
||||
struct TopicChannel {
|
||||
tx: broadcast::Sender<TelemetryMessage>,
|
||||
seq: AtomicU64,
|
||||
published: AtomicU64,
|
||||
bytes_out: AtomicU64,
|
||||
}
|
||||
|
||||
impl TopicChannel {
|
||||
fn new(capacity: usize) -> Self {
|
||||
let (tx, _) = broadcast::channel(capacity);
|
||||
Self {
|
||||
tx,
|
||||
seq: AtomicU64::new(0),
|
||||
published: AtomicU64::new(0),
|
||||
bytes_out: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop counter map. Keyed by `(client_id, topic)`. We isolate it
|
||||
/// behind a `parking_lot::Mutex` because writes happen from
|
||||
/// per-client tonic stream tasks; reads happen from the health
|
||||
/// surface — both are infrequent compared to the broadcast hot path.
|
||||
type DropMap = Mutex<HashMap<(String, Topic), AtomicU64>>;
|
||||
|
||||
pub struct TelemetryPublisher {
|
||||
topics: HashMap<Topic, TopicChannel>,
|
||||
drops: DropMap,
|
||||
subscribed_clients: AtomicUsize,
|
||||
}
|
||||
|
||||
impl TelemetryPublisher {
|
||||
/// Build a publisher with the same per-topic capacity for every
|
||||
/// topic. Use [`new_with_capacities`] if a single topic needs a
|
||||
/// different buffer (e.g. MapObjectsBundle which is bursty).
|
||||
pub fn new(capacity: usize) -> Arc<Self> {
|
||||
let mut topics = HashMap::with_capacity(ALL_TOPICS.len());
|
||||
for &t in ALL_TOPICS {
|
||||
topics.insert(t, TopicChannel::new(capacity));
|
||||
}
|
||||
Arc::new(Self {
|
||||
topics,
|
||||
drops: Mutex::new(HashMap::new()),
|
||||
subscribed_clients: AtomicUsize::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn default_capacity() -> Arc<Self> {
|
||||
Self::new(DEFAULT_TOPIC_CAPACITY)
|
||||
}
|
||||
|
||||
/// Serialise `payload` and fan out to every subscriber on `topic`.
|
||||
///
|
||||
/// Never blocks. `broadcast::Sender::send` returns the number of
|
||||
/// receivers it queued for; a return of 0 means no current
|
||||
/// subscribers, which is fine — we still bump the published
|
||||
/// counter so the operator can confirm the producer is alive.
|
||||
pub fn publish<T: Serialize>(&self, topic: Topic, payload: &T) -> Result<(), PublishError> {
|
||||
let channel = match self.topics.get(&topic) {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
warn!(?topic, "unknown topic; dropping publish");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let payload_json = serde_json::to_vec(payload)
|
||||
.map_err(|e| PublishError::Serialize { topic, source: e })?;
|
||||
let bytes_len = payload_json.len() as u64;
|
||||
|
||||
let seq = channel.seq.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
let msg = TelemetryMessage {
|
||||
topic: topic as i32,
|
||||
monotonic_ts_ns: shared::clock::MonoClock::new().elapsed_ns(),
|
||||
sequence: seq,
|
||||
payload_json,
|
||||
};
|
||||
|
||||
let _ = channel.tx.send(msg);
|
||||
channel.published.fetch_add(1, Ordering::Relaxed);
|
||||
channel.bytes_out.fetch_add(bytes_len, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Open a per-client receiver for `topic`. The caller (the
|
||||
/// `Subscribe` RPC handler) is responsible for handling
|
||||
/// `RecvError::Lagged` by calling [`record_drops`].
|
||||
pub(crate) fn subscribe_topic(
|
||||
&self,
|
||||
topic: Topic,
|
||||
) -> Option<broadcast::Receiver<TelemetryMessage>> {
|
||||
self.topics.get(&topic).map(|c| c.tx.subscribe())
|
||||
}
|
||||
|
||||
pub(crate) fn register_client(&self) {
|
||||
self.subscribed_clients.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub(crate) fn deregister_client(&self) {
|
||||
self.subscribed_clients.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn record_drops(&self, client_id: &str, topic: Topic, n: u64) {
|
||||
if n == 0 {
|
||||
return;
|
||||
}
|
||||
let mut map = self.drops.lock();
|
||||
let key = (client_id.to_string(), topic);
|
||||
map.entry(key)
|
||||
.or_insert_with(|| AtomicU64::new(0))
|
||||
.fetch_add(n, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn snapshot(&self) -> PublisherSnapshot {
|
||||
let mut per_topic = HashMap::with_capacity(self.topics.len());
|
||||
let mut total_published = 0u64;
|
||||
for (&t, c) in &self.topics {
|
||||
let published = c.published.load(Ordering::Relaxed);
|
||||
let bytes_out = c.bytes_out.load(Ordering::Relaxed);
|
||||
total_published = total_published.saturating_add(published);
|
||||
per_topic.insert(
|
||||
t,
|
||||
PerTopicCounters {
|
||||
published,
|
||||
bytes_out,
|
||||
},
|
||||
);
|
||||
}
|
||||
let drops_map = self.drops.lock();
|
||||
let drops_total: HashMap<(String, Topic), u64> = drops_map
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.load(Ordering::Relaxed)))
|
||||
.collect();
|
||||
PublisherSnapshot {
|
||||
subscribed_clients: self.subscribed_clients.load(Ordering::Relaxed),
|
||||
published_total: total_published,
|
||||
per_topic,
|
||||
drops_total,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde::Deserialize;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, PartialEq)]
|
||||
struct Sample {
|
||||
v: u32,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn publish_with_no_subscribers_is_no_op() {
|
||||
// Arrange
|
||||
let pub_ = TelemetryPublisher::new(8);
|
||||
|
||||
// Act
|
||||
pub_.publish(Topic::TelemetrySample, &Sample { v: 1 })
|
||||
.unwrap();
|
||||
|
||||
// Assert
|
||||
let snap = pub_.snapshot();
|
||||
assert_eq!(snap.subscribed_clients, 0);
|
||||
assert_eq!(snap.per_topic[&Topic::TelemetrySample].published, 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn slow_subscriber_lags_fast_subscriber_does_not() {
|
||||
// Arrange — capacity 4 so 100 publishes overflow trivially.
|
||||
let pub_ = TelemetryPublisher::new(4);
|
||||
let mut fast = pub_.subscribe_topic(Topic::TelemetrySample).unwrap();
|
||||
let mut slow = pub_.subscribe_topic(Topic::TelemetrySample).unwrap();
|
||||
|
||||
// Act — burst 100 messages WITHOUT giving `slow` a chance to drain.
|
||||
for v in 0..100u32 {
|
||||
pub_.publish(Topic::TelemetrySample, &Sample { v }).unwrap();
|
||||
// fast drains immediately; slow does nothing.
|
||||
let _ = fast.recv().await.unwrap();
|
||||
}
|
||||
|
||||
// Drain `slow` and count lag.
|
||||
let mut total_drops: u64 = 0;
|
||||
let mut delivered: u64 = 0;
|
||||
loop {
|
||||
match slow.try_recv() {
|
||||
Ok(_) => delivered += 1,
|
||||
Err(broadcast::error::TryRecvError::Lagged(n)) => total_drops += n,
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
|
||||
// Assert — fast got all 100; slow lost most but delivered ≤ capacity.
|
||||
assert!(delivered <= 4, "slow can hold at most capacity (4)");
|
||||
assert_eq!(
|
||||
total_drops + delivered,
|
||||
100,
|
||||
"every published msg accounted for"
|
||||
);
|
||||
assert!(total_drops > 0, "slow subscriber MUST have lagged");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_drops_accumulates_per_client_topic() {
|
||||
// Arrange
|
||||
let pub_ = TelemetryPublisher::new(8);
|
||||
|
||||
// Act
|
||||
pub_.record_drops("client_a", Topic::TelemetrySample, 5);
|
||||
pub_.record_drops("client_a", Topic::TelemetrySample, 3);
|
||||
pub_.record_drops("client_a", Topic::GimbalState, 2);
|
||||
pub_.record_drops("client_b", Topic::TelemetrySample, 1);
|
||||
|
||||
// Assert
|
||||
let snap = pub_.snapshot();
|
||||
assert_eq!(
|
||||
snap.drops_total[&("client_a".to_string(), Topic::TelemetrySample)],
|
||||
8
|
||||
);
|
||||
assert_eq!(
|
||||
snap.drops_total[&("client_a".to_string(), Topic::GimbalState)],
|
||||
2
|
||||
);
|
||||
assert_eq!(
|
||||
snap.drops_total[&("client_b".to_string(), Topic::TelemetrySample)],
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn register_deregister_balance_tracks_subscribed_clients() {
|
||||
// Arrange
|
||||
let pub_ = TelemetryPublisher::new(8);
|
||||
assert_eq!(pub_.snapshot().subscribed_clients, 0);
|
||||
|
||||
// Act
|
||||
pub_.register_client();
|
||||
pub_.register_client();
|
||||
assert_eq!(pub_.snapshot().subscribed_clients, 2);
|
||||
pub_.deregister_client();
|
||||
|
||||
// Assert
|
||||
assert_eq!(pub_.snapshot().subscribed_clients, 1);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
//! AZ-675 — gRPC `TelemetryStream::Subscribe` service implementation.
|
||||
//!
|
||||
//! The client sends a single `SubscribeRequest`; the server returns a
|
||||
//! server-streaming response built directly from per-topic
|
||||
//! `BroadcastStream`s merged with `StreamMap`. The tonic transport
|
||||
//! is what polls our stream — when the wire (or the operator client)
|
||||
//! cannot keep up, the broadcast ring overflows that client's cursor
|
||||
//! and `BroadcastStream` yields `Err(BroadcastStreamRecvError::Lagged(n))`
|
||||
//! on the next poll. That is the *only* place drop accounting
|
||||
//! happens: there is no intermediate mpsc buffer that could absorb
|
||||
//! back-pressure and hide lag.
|
||||
//!
|
||||
//! `StreamGuard` decrements `subscribed_clients` on stream drop.
|
||||
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
use tokio_stream::wrappers::errors::BroadcastStreamRecvError;
|
||||
use tokio_stream::wrappers::BroadcastStream;
|
||||
use tokio_stream::{Stream, StreamExt, StreamMap};
|
||||
use tonic::{Request, Response, Status};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::internal::proto::telemetry_stream_server::TelemetryStream;
|
||||
use crate::internal::proto::{SubscribeRequest, TelemetryMessage, Topic};
|
||||
use crate::internal::publisher::{TelemetryPublisher, ALL_TOPICS};
|
||||
|
||||
pub struct TelemetryService {
|
||||
publisher: Arc<TelemetryPublisher>,
|
||||
}
|
||||
|
||||
impl TelemetryService {
|
||||
pub fn new(publisher: Arc<TelemetryPublisher>) -> Self {
|
||||
Self { publisher }
|
||||
}
|
||||
}
|
||||
|
||||
type SubscribeStream = Pin<Box<dyn Stream<Item = Result<TelemetryMessage, Status>> + Send>>;
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl TelemetryStream for TelemetryService {
|
||||
type SubscribeStream = SubscribeStream;
|
||||
|
||||
async fn subscribe(
|
||||
&self,
|
||||
request: Request<SubscribeRequest>,
|
||||
) -> Result<Response<Self::SubscribeStream>, Status> {
|
||||
let req = request.into_inner();
|
||||
if req.client_id.trim().is_empty() {
|
||||
return Err(Status::invalid_argument("client_id is required"));
|
||||
}
|
||||
let client_id = req.client_id.clone();
|
||||
|
||||
let requested: Vec<Topic> = if req.topics.is_empty() {
|
||||
ALL_TOPICS.to_vec()
|
||||
} else {
|
||||
let mut out = Vec::with_capacity(req.topics.len());
|
||||
for raw in &req.topics {
|
||||
let t = Topic::try_from(*raw)
|
||||
.map_err(|_| Status::invalid_argument(format!("unknown topic {raw}")))?;
|
||||
if matches!(t, Topic::Unspecified) {
|
||||
return Err(Status::invalid_argument("TOPIC_UNSPECIFIED not allowed"));
|
||||
}
|
||||
out.push(t);
|
||||
}
|
||||
out
|
||||
};
|
||||
|
||||
let mut map: StreamMap<Topic, BroadcastStream<TelemetryMessage>> = StreamMap::new();
|
||||
for &t in &requested {
|
||||
match self.publisher.subscribe_topic(t) {
|
||||
Some(rx) => {
|
||||
map.insert(t, BroadcastStream::new(rx));
|
||||
}
|
||||
None => {
|
||||
return Err(Status::failed_precondition(format!(
|
||||
"topic {t:?} not registered"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.publisher.register_client();
|
||||
info!(client_id = %client_id, topics = ?requested, "telemetry subscribe");
|
||||
|
||||
let publisher = Arc::clone(&self.publisher);
|
||||
let cid = client_id.clone();
|
||||
let stream = map.filter_map(move |(topic, item)| match item {
|
||||
Ok(msg) => Some(Ok(msg)),
|
||||
Err(BroadcastStreamRecvError::Lagged(n)) => {
|
||||
warn!(client_id = %cid, ?topic, dropped = n, "slow client lagged");
|
||||
publisher.record_drops(&cid, topic, n);
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
let stream = StreamGuard {
|
||||
inner: stream,
|
||||
publisher: Arc::clone(&self.publisher),
|
||||
};
|
||||
|
||||
Ok(Response::new(Box::pin(stream) as Self::SubscribeStream))
|
||||
}
|
||||
}
|
||||
|
||||
/// Decrement `subscribed_clients` when the per-client outbound
|
||||
/// stream is dropped (tonic drops the stream when the client side
|
||||
/// goes away).
|
||||
struct StreamGuard<S> {
|
||||
inner: S,
|
||||
publisher: Arc<TelemetryPublisher>,
|
||||
}
|
||||
|
||||
impl<S: Stream + Unpin> Stream for StreamGuard<S> {
|
||||
type Item = S::Item;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
Pin::new(&mut self.inner).poll_next(cx)
|
||||
}
|
||||
}
|
||||
|
||||
impl<S> Drop for StreamGuard<S> {
|
||||
fn drop(&mut self) {
|
||||
self.publisher.deregister_client();
|
||||
}
|
||||
}
|
||||
@@ -1,57 +1,224 @@
|
||||
//! `telemetry_stream` — always-on uplink to the Ground Station + operator-command downlink.
|
||||
//!
|
||||
//! Real implementation lands in:
|
||||
//! - AZ-675 `telemetry_stream_grpc_server`
|
||||
//! - AZ-676 `telemetry_stream_video_path`
|
||||
//! - AZ-677 `telemetry_stream_mapobjects_snapshot`
|
||||
//! Real implementations:
|
||||
//! - **AZ-675 (this crate, this batch)**: Tonic gRPC server, per-client
|
||||
//! bounded queue, drop-oldest back-pressure, drop counters. Topics:
|
||||
//! `TelemetrySample`, `GimbalState`, `DetectionEvent`,
|
||||
//! `MovementCandidate`, `MapObjectsBundle`.
|
||||
//! - **AZ-676**: video frame topic (separate RPC, server-streamed
|
||||
//! binary payloads).
|
||||
//! - **AZ-677**: diff-based snapshot emission for `MapObjectsBundle`.
|
||||
//! - **AZ-678+**: command-auth on the return path (operator_bridge).
|
||||
|
||||
pub mod internal;
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::task::JoinHandle;
|
||||
use tonic::transport::Server;
|
||||
|
||||
use shared::contracts::TelemetrySink;
|
||||
use shared::error::{AutopilotError, Result};
|
||||
use shared::health::ComponentHealth;
|
||||
use shared::health::{ComponentHealth, HealthLevel};
|
||||
use shared::models::detection::DetectionBatch;
|
||||
use shared::models::frame::Frame;
|
||||
use shared::models::operator::OperatorCommand;
|
||||
|
||||
use crate::internal::proto::telemetry_stream_server::TelemetryStreamServer;
|
||||
use crate::internal::proto::Topic;
|
||||
use crate::internal::publisher::{TelemetryPublisher, DEFAULT_TOPIC_CAPACITY};
|
||||
use crate::internal::server::TelemetryService;
|
||||
|
||||
pub use crate::internal::proto::{
|
||||
telemetry_stream_client::TelemetryStreamClient, SubscribeRequest, TelemetryMessage,
|
||||
Topic as TelemetryTopic,
|
||||
};
|
||||
pub use crate::internal::publisher::{
|
||||
PerTopicCounters, PublishError, PublisherSnapshot, ALL_TOPICS,
|
||||
};
|
||||
|
||||
const NAME: &str = "telemetry_stream";
|
||||
|
||||
/// Per-(client, topic) drop rate at or above which health flips to
|
||||
/// yellow. Picked to surface persistent slow consumers without
|
||||
/// flapping on a single transient lag spike.
|
||||
const DROP_YELLOW_THRESHOLD: u64 = 100;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TelemetryStreamConfig {
|
||||
/// Where the Tonic gRPC server binds. `0.0.0.0:50061` by default.
|
||||
pub listen_addr: SocketAddr,
|
||||
/// Per-topic broadcast capacity (per subscriber buffer).
|
||||
pub topic_capacity: usize,
|
||||
/// Bounded capacity of the downlink command channel that feeds
|
||||
/// `operator_bridge`.
|
||||
pub downlink_capacity: usize,
|
||||
}
|
||||
|
||||
impl Default for TelemetryStreamConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
listen_addr: "0.0.0.0:50061".parse().expect("hardcoded addr parses"),
|
||||
topic_capacity: DEFAULT_TOPIC_CAPACITY,
|
||||
downlink_capacity: 64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TelemetryStream {
|
||||
publisher: Arc<TelemetryPublisher>,
|
||||
commands_tx: mpsc::Sender<OperatorCommand>,
|
||||
commands_rx: Option<mpsc::Receiver<OperatorCommand>>,
|
||||
config: TelemetryStreamConfig,
|
||||
}
|
||||
|
||||
impl TelemetryStream {
|
||||
pub fn new(downlink_capacity: usize) -> Self {
|
||||
let (commands_tx, commands_rx) = mpsc::channel(downlink_capacity);
|
||||
Self::with_config(TelemetryStreamConfig {
|
||||
downlink_capacity,
|
||||
..TelemetryStreamConfig::default()
|
||||
})
|
||||
}
|
||||
|
||||
pub fn with_config(config: TelemetryStreamConfig) -> Self {
|
||||
let publisher = TelemetryPublisher::new(config.topic_capacity);
|
||||
let (commands_tx, commands_rx) = mpsc::channel(config.downlink_capacity);
|
||||
Self {
|
||||
publisher,
|
||||
commands_tx,
|
||||
commands_rx: Some(commands_rx),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn handle(&self) -> TelemetryStreamHandle {
|
||||
TelemetryStreamHandle {
|
||||
publisher: Arc::clone(&self.publisher),
|
||||
commands_tx: self.commands_tx.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Take the downlink command receiver. The composition root forwards it to
|
||||
/// `operator_bridge` as `Receiver<OperatorCommand>`.
|
||||
/// Take the downlink command receiver. The composition root
|
||||
/// forwards it to `operator_bridge` as `Receiver<OperatorCommand>`.
|
||||
pub fn take_command_receiver(&mut self) -> Option<mpsc::Receiver<OperatorCommand>> {
|
||||
self.commands_rx.take()
|
||||
}
|
||||
|
||||
/// Spawn the Tonic server. Returns a JoinHandle that runs until
|
||||
/// `shutdown` is signalled (closing the returned `shutdown_tx`).
|
||||
/// The server is bound on `config.listen_addr`.
|
||||
pub fn spawn_grpc_server(
|
||||
&self,
|
||||
) -> Result<(
|
||||
JoinHandle<std::result::Result<(), tonic::transport::Error>>,
|
||||
GrpcShutdown,
|
||||
)> {
|
||||
let listen_addr = self.config.listen_addr;
|
||||
let publisher = Arc::clone(&self.publisher);
|
||||
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
|
||||
|
||||
let svc = TelemetryStreamServer::new(TelemetryService::new(publisher));
|
||||
let join = tokio::spawn(async move {
|
||||
Server::builder()
|
||||
.add_service(svc)
|
||||
.serve_with_shutdown(listen_addr, async move {
|
||||
let _ = shutdown_rx.await;
|
||||
})
|
||||
.await
|
||||
});
|
||||
|
||||
Ok((
|
||||
join,
|
||||
GrpcShutdown {
|
||||
tx: Some(shutdown_tx),
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
/// Spawn the Tonic server bound on a specific `TcpListener`.
|
||||
/// Useful for tests that need to know the actual port ahead of
|
||||
/// time (bind to `127.0.0.1:0` then read the assigned port).
|
||||
pub fn spawn_grpc_server_on(
|
||||
&self,
|
||||
listener: std::net::TcpListener,
|
||||
) -> Result<(
|
||||
JoinHandle<std::result::Result<(), tonic::transport::Error>>,
|
||||
GrpcShutdown,
|
||||
)> {
|
||||
listener
|
||||
.set_nonblocking(true)
|
||||
.map_err(|e| AutopilotError::Internal(format!("set_nonblocking: {e}")))?;
|
||||
let tokio_listener = tokio::net::TcpListener::from_std(listener)
|
||||
.map_err(|e| AutopilotError::Internal(format!("TcpListener::from_std: {e}")))?;
|
||||
let stream = tokio_stream::wrappers::TcpListenerStream::new(tokio_listener);
|
||||
|
||||
let publisher = Arc::clone(&self.publisher);
|
||||
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
|
||||
let svc = TelemetryStreamServer::new(TelemetryService::new(publisher));
|
||||
|
||||
let join = tokio::spawn(async move {
|
||||
Server::builder()
|
||||
.add_service(svc)
|
||||
.serve_with_incoming_shutdown(stream, async move {
|
||||
let _ = shutdown_rx.await;
|
||||
})
|
||||
.await
|
||||
});
|
||||
|
||||
Ok((
|
||||
join,
|
||||
GrpcShutdown {
|
||||
tx: Some(shutdown_tx),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII shutdown trigger for the spawned gRPC server. Drop the value
|
||||
/// or call `shutdown()` to stop the server.
|
||||
pub struct GrpcShutdown {
|
||||
tx: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
}
|
||||
|
||||
impl GrpcShutdown {
|
||||
pub fn shutdown(mut self) {
|
||||
if let Some(tx) = self.tx.take() {
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GrpcShutdown {
|
||||
fn drop(&mut self) {
|
||||
if let Some(tx) = self.tx.take() {
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TelemetryStreamHandle {
|
||||
publisher: Arc<TelemetryPublisher>,
|
||||
commands_tx: mpsc::Sender<OperatorCommand>,
|
||||
}
|
||||
|
||||
impl TelemetryStreamHandle {
|
||||
/// Inject an operator command. Production path is fed by the downlink
|
||||
/// receiver in `internal::downlink/*`; tests can call this directly.
|
||||
/// Publish a payload on `topic`. Never blocks the caller; slow
|
||||
/// subscribers experience drops accounted in [`snapshot`].
|
||||
pub fn publish<T: serde::Serialize>(
|
||||
&self,
|
||||
topic: TelemetryTopic,
|
||||
payload: &T,
|
||||
) -> std::result::Result<(), PublishError> {
|
||||
self.publisher.publish(topic, payload)
|
||||
}
|
||||
|
||||
/// Inject an operator command downlink. Production path is fed
|
||||
/// by the gRPC return half once AZ-678 lands; tests may call this
|
||||
/// directly.
|
||||
pub async fn submit_command(&self, command: OperatorCommand) -> Result<()> {
|
||||
self.commands_tx
|
||||
.send(command)
|
||||
@@ -59,8 +226,32 @@ impl TelemetryStreamHandle {
|
||||
.map_err(|_| AutopilotError::Internal("downlink channel closed".into()))
|
||||
}
|
||||
|
||||
pub fn snapshot(&self) -> PublisherSnapshot {
|
||||
self.publisher.snapshot()
|
||||
}
|
||||
|
||||
pub fn health(&self) -> ComponentHealth {
|
||||
ComponentHealth::disabled(NAME)
|
||||
let snap = self.publisher.snapshot();
|
||||
let mut h = ComponentHealth::green(NAME);
|
||||
|
||||
let hot_drops: Vec<_> = snap
|
||||
.drops_total
|
||||
.iter()
|
||||
.filter(|(_, &v)| v >= DROP_YELLOW_THRESHOLD)
|
||||
.collect();
|
||||
|
||||
let detail = format!(
|
||||
"subscribers={} published_total={} hot_drop_pairs={}",
|
||||
snap.subscribed_clients,
|
||||
snap.published_total,
|
||||
hot_drops.len()
|
||||
);
|
||||
|
||||
if !hot_drops.is_empty() {
|
||||
h.level = HealthLevel::Yellow;
|
||||
}
|
||||
h.detail = Some(detail);
|
||||
h
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,14 +259,14 @@ impl TelemetryStreamHandle {
|
||||
impl TelemetrySink for TelemetryStreamHandle {
|
||||
async fn push_frame(&self, _frame: Frame) -> Result<()> {
|
||||
Err(AutopilotError::NotImplemented(
|
||||
"telemetry_stream::push_frame (AZ-676)",
|
||||
"telemetry_stream::push_frame (AZ-676 video path)",
|
||||
))
|
||||
}
|
||||
|
||||
async fn push_detections(&self, _batch: DetectionBatch) -> Result<()> {
|
||||
Err(AutopilotError::NotImplemented(
|
||||
"telemetry_stream::push_detections (AZ-675)",
|
||||
))
|
||||
async fn push_detections(&self, batch: DetectionBatch) -> Result<()> {
|
||||
self.publisher
|
||||
.publish(Topic::DetectionEvent, &batch)
|
||||
.map_err(|e| AutopilotError::Internal(format!("publish detections: {e}")))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,8 +275,35 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn it_compiles() {
|
||||
let h = TelemetryStream::new(8).handle();
|
||||
assert_eq!(h.health().level, shared::health::HealthLevel::Disabled);
|
||||
fn handle_starts_with_zero_subscribers_and_green_health() {
|
||||
// Arrange
|
||||
let s = TelemetryStream::new(8);
|
||||
let h = s.handle();
|
||||
|
||||
// Act
|
||||
let snap = h.snapshot();
|
||||
let health = h.health();
|
||||
|
||||
// Assert
|
||||
assert_eq!(snap.subscribed_clients, 0);
|
||||
assert_eq!(snap.published_total, 0);
|
||||
assert_eq!(health.level, HealthLevel::Green);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn publish_without_subscribers_is_no_op_but_counts() {
|
||||
// Arrange
|
||||
let s = TelemetryStream::new(8);
|
||||
let h = s.handle();
|
||||
|
||||
// Act
|
||||
h.publish(
|
||||
TelemetryTopic::TelemetrySample,
|
||||
&serde_json::json!({"v": 1}),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Assert
|
||||
assert_eq!(h.snapshot().per_topic[&Topic::TelemetrySample].published, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,366 @@
|
||||
//! AZ-675 integration tests — Tonic server + per-client lossy queue
|
||||
//! exercised through an in-process gRPC client (the only stub allowed
|
||||
//! per the task's Runtime Completeness gate).
|
||||
|
||||
use std::net::TcpListener;
|
||||
use std::time::Duration;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::timeout;
|
||||
use tokio_stream::StreamExt;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tonic::Request;
|
||||
|
||||
use telemetry_stream::{SubscribeRequest, TelemetryStream, TelemetryStreamClient, TelemetryTopic};
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, PartialEq, Clone)]
|
||||
struct Sample {
|
||||
n: u32,
|
||||
}
|
||||
|
||||
/// Bind a TCP listener on `127.0.0.1:0`, return the listener and the
|
||||
/// resolved port. The test stack uses ephemeral ports so multiple
|
||||
/// integration tests in the same suite never collide.
|
||||
fn bind_ephemeral() -> (TcpListener, u16) {
|
||||
let l = TcpListener::bind("127.0.0.1:0").expect("bind ephemeral");
|
||||
let port = l.local_addr().unwrap().port();
|
||||
(l, port)
|
||||
}
|
||||
|
||||
async fn connect(port: u16) -> TelemetryStreamClient<Channel> {
|
||||
let url = format!("http://127.0.0.1:{port}");
|
||||
let endpoint = Endpoint::from_shared(url)
|
||||
.unwrap()
|
||||
.connect_timeout(Duration::from_secs(2));
|
||||
|
||||
// Tiny retry loop — the server can take a few ms to bind.
|
||||
for _ in 0..50 {
|
||||
if let Ok(c) = TelemetryStreamClient::connect(endpoint.clone()).await {
|
||||
return c;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(20)).await;
|
||||
}
|
||||
panic!("gRPC client failed to connect within ~1s");
|
||||
}
|
||||
|
||||
/// AC-1 — multiple subscribers receive the same stream, ordering preserved.
|
||||
#[tokio::test]
|
||||
async fn ac1_multiple_subscribers_receive_same_stream() {
|
||||
// Arrange
|
||||
let (listener, port) = bind_ephemeral();
|
||||
let server = TelemetryStream::new(8);
|
||||
let handle = server.handle();
|
||||
let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap();
|
||||
|
||||
let mut c1 = connect(port).await;
|
||||
let mut c2 = connect(port).await;
|
||||
let mut c3 = connect(port).await;
|
||||
|
||||
let req = |id: &str| {
|
||||
Request::new(SubscribeRequest {
|
||||
client_id: id.to_string(),
|
||||
topics: vec![TelemetryTopic::TelemetrySample as i32],
|
||||
})
|
||||
};
|
||||
|
||||
let mut s1 = c1.subscribe(req("a")).await.unwrap().into_inner();
|
||||
let mut s2 = c2.subscribe(req("b")).await.unwrap().into_inner();
|
||||
let mut s3 = c3.subscribe(req("c")).await.unwrap().into_inner();
|
||||
|
||||
// Give the server time to register all three before publishing.
|
||||
for _ in 0..50 {
|
||||
if handle.snapshot().subscribed_clients == 3 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
assert_eq!(handle.snapshot().subscribed_clients, 3);
|
||||
|
||||
// Act
|
||||
for n in 0..100u32 {
|
||||
handle
|
||||
.publish(TelemetryTopic::TelemetrySample, &Sample { n })
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
async fn collect100(
|
||||
s: &mut tonic::Streaming<telemetry_stream::TelemetryMessage>,
|
||||
) -> Vec<Sample> {
|
||||
let mut out = Vec::new();
|
||||
while out.len() < 100 {
|
||||
let msg = timeout(Duration::from_secs(2), s.next())
|
||||
.await
|
||||
.expect("ac1 client did not receive 100 in 2s")
|
||||
.expect("stream ended")
|
||||
.expect("server status");
|
||||
let sample: Sample = serde_json::from_slice(&msg.payload_json).unwrap();
|
||||
out.push(sample);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
let r1 = collect100(&mut s1).await;
|
||||
let r2 = collect100(&mut s2).await;
|
||||
let r3 = collect100(&mut s3).await;
|
||||
|
||||
// Assert — all three receive all 100 in order.
|
||||
let expected: Vec<Sample> = (0..100).map(|n| Sample { n }).collect();
|
||||
assert_eq!(r1, expected);
|
||||
assert_eq!(r2, expected);
|
||||
assert_eq!(r3, expected);
|
||||
}
|
||||
|
||||
/// AC-2 — slow subscriber drops oldest, healthy unaffected.
|
||||
///
|
||||
/// We can't simulate "client never polls" through real tonic+H2
|
||||
/// because the OS TCP + HTTP/2 receive buffers (often 256 KB+) will
|
||||
/// absorb a small JSON burst entirely with no observable drop. The
|
||||
/// realistic AC-2 condition is "slow drain rate" — the slow client
|
||||
/// polls but at a fraction of the publish rate. With large payloads
|
||||
/// and a small per-topic broadcast capacity, the slow client's
|
||||
/// broadcast cursor falls behind the producer and `BroadcastStream`
|
||||
/// emits `Lagged(n)` — the only signal AZ-675 wires into the per-
|
||||
/// `(client_id, topic)` drop counters.
|
||||
#[tokio::test]
|
||||
async fn ac2_slow_subscriber_drops_oldest_healthy_unaffected() {
|
||||
// Arrange — capacity 4, big payloads so wire back-pressure
|
||||
// propagates fast and broadcast lag is observable end-to-end.
|
||||
let (listener, port) = bind_ephemeral();
|
||||
let cfg = telemetry_stream::TelemetryStreamConfig {
|
||||
topic_capacity: 4,
|
||||
..telemetry_stream::TelemetryStreamConfig::default()
|
||||
};
|
||||
let server = TelemetryStream::with_config(cfg);
|
||||
let handle = server.handle();
|
||||
let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap();
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct Big {
|
||||
n: u32,
|
||||
// 64 KB filler per message — fills H2 receive window quickly.
|
||||
pad: Vec<u8>,
|
||||
}
|
||||
|
||||
let mut c_fast = connect(port).await;
|
||||
let mut c_slow = connect(port).await;
|
||||
let req = |id: &str| {
|
||||
Request::new(SubscribeRequest {
|
||||
client_id: id.to_string(),
|
||||
topics: vec![TelemetryTopic::TelemetrySample as i32],
|
||||
})
|
||||
};
|
||||
let mut s_fast = c_fast.subscribe(req("fast")).await.unwrap().into_inner();
|
||||
let mut s_slow = c_slow.subscribe(req("slow")).await.unwrap().into_inner();
|
||||
|
||||
for _ in 0..50 {
|
||||
if handle.snapshot().subscribed_clients == 2 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
assert_eq!(handle.snapshot().subscribed_clients, 2);
|
||||
|
||||
let total: u32 = 300;
|
||||
|
||||
// Fast drains immediately, slow drains 1 msg every 50ms (way below publish rate).
|
||||
let fast_task = tokio::spawn(async move {
|
||||
let mut seen = 0u32;
|
||||
while seen < total {
|
||||
let _msg = timeout(Duration::from_secs(15), s_fast.next())
|
||||
.await
|
||||
.expect("fast did not receive everything in 15s")
|
||||
.expect("stream ended")
|
||||
.expect("server status");
|
||||
seen += 1;
|
||||
}
|
||||
seen
|
||||
});
|
||||
let slow_task = tokio::spawn(async move {
|
||||
let mut received = 0u32;
|
||||
let deadline = std::time::Instant::now() + Duration::from_secs(30);
|
||||
while std::time::Instant::now() < deadline {
|
||||
match timeout(Duration::from_millis(500), s_slow.next()).await {
|
||||
Ok(Some(Ok(_msg))) => received += 1,
|
||||
Ok(Some(Err(_))) => break,
|
||||
Ok(None) => break,
|
||||
Err(_) => break,
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
received
|
||||
});
|
||||
|
||||
// Act — burst publishes at ~1ms cadence with 64 KB payloads so
|
||||
// broadcast capacity is overrun for the slow consumer.
|
||||
let payload_pad = vec![0u8; 64 * 1024];
|
||||
for n in 0..total {
|
||||
handle
|
||||
.publish(
|
||||
TelemetryTopic::TelemetrySample,
|
||||
&Big {
|
||||
n,
|
||||
pad: payload_pad.clone(),
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
tokio::time::sleep(Duration::from_millis(2)).await;
|
||||
}
|
||||
|
||||
let fast_count = fast_task.await.unwrap();
|
||||
let slow_count = slow_task.await.unwrap();
|
||||
|
||||
let snap = handle.snapshot();
|
||||
let slow_drops = snap
|
||||
.drops_total
|
||||
.get(&("slow".to_string(), TelemetryTopic::TelemetrySample))
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
let fast_drops = snap
|
||||
.drops_total
|
||||
.get(&("fast".to_string(), TelemetryTopic::TelemetrySample))
|
||||
.copied()
|
||||
.unwrap_or(0);
|
||||
|
||||
// Assert
|
||||
assert_eq!(fast_count, total, "fast client receives every message");
|
||||
assert_eq!(fast_drops, 0, "fast client MUST NOT have drops");
|
||||
assert!(
|
||||
slow_drops > 0,
|
||||
"slow client MUST have lagged at some point (got 0 drops; slow_count={slow_count})"
|
||||
);
|
||||
assert!(
|
||||
slow_count < total,
|
||||
"slow client should have lost messages (got {slow_count}/{total})"
|
||||
);
|
||||
assert!(
|
||||
(slow_count as u64) + slow_drops <= (total as u64) + 4,
|
||||
"accounted={} should not exceed total+capacity",
|
||||
(slow_count as u64) + slow_drops
|
||||
);
|
||||
}
|
||||
|
||||
/// AC-3 — disconnect cleanly removes subscriber.
|
||||
#[tokio::test]
|
||||
async fn ac3_disconnect_decrements_subscribed_clients() {
|
||||
// Arrange
|
||||
let (listener, port) = bind_ephemeral();
|
||||
let server = TelemetryStream::new(8);
|
||||
let handle = server.handle();
|
||||
let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap();
|
||||
|
||||
let mut c1 = connect(port).await;
|
||||
let req = || {
|
||||
Request::new(SubscribeRequest {
|
||||
client_id: "ephemeral".to_string(),
|
||||
topics: vec![],
|
||||
})
|
||||
};
|
||||
let s1 = c1.subscribe(req()).await.unwrap().into_inner();
|
||||
|
||||
for _ in 0..50 {
|
||||
if handle.snapshot().subscribed_clients == 1 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
assert_eq!(handle.snapshot().subscribed_clients, 1);
|
||||
|
||||
// Act — drop the stream (simulates client cancel).
|
||||
drop(s1);
|
||||
drop(c1);
|
||||
|
||||
// Assert — register/deregister is eventually consistent; wait briefly.
|
||||
let mut final_count = 99;
|
||||
for _ in 0..100 {
|
||||
final_count = handle.snapshot().subscribed_clients;
|
||||
if final_count == 0 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(20)).await;
|
||||
}
|
||||
assert_eq!(
|
||||
final_count, 0,
|
||||
"disconnect must decrement subscribed_clients"
|
||||
);
|
||||
}
|
||||
|
||||
/// Subscribing to an empty `topics` list defaults to ALL topics.
|
||||
#[tokio::test]
|
||||
async fn empty_topics_list_defaults_to_all() {
|
||||
// Arrange
|
||||
let (listener, port) = bind_ephemeral();
|
||||
let server = TelemetryStream::new(8);
|
||||
let handle = server.handle();
|
||||
let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap();
|
||||
|
||||
let mut client = connect(port).await;
|
||||
let mut stream = client
|
||||
.subscribe(Request::new(SubscribeRequest {
|
||||
client_id: "all".to_string(),
|
||||
topics: vec![],
|
||||
}))
|
||||
.await
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
|
||||
for _ in 0..50 {
|
||||
if handle.snapshot().subscribed_clients == 1 {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
|
||||
// Act — publish one message on each topic.
|
||||
handle
|
||||
.publish(TelemetryTopic::TelemetrySample, &Sample { n: 1 })
|
||||
.unwrap();
|
||||
handle
|
||||
.publish(TelemetryTopic::GimbalState, &Sample { n: 2 })
|
||||
.unwrap();
|
||||
handle
|
||||
.publish(TelemetryTopic::DetectionEvent, &Sample { n: 3 })
|
||||
.unwrap();
|
||||
handle
|
||||
.publish(TelemetryTopic::MovementCandidate, &Sample { n: 4 })
|
||||
.unwrap();
|
||||
handle
|
||||
.publish(TelemetryTopic::MapObjectsBundle, &Sample { n: 5 })
|
||||
.unwrap();
|
||||
|
||||
let mut seen_topics = std::collections::HashSet::new();
|
||||
while seen_topics.len() < 5 {
|
||||
let msg = timeout(Duration::from_secs(2), stream.next())
|
||||
.await
|
||||
.expect("did not receive all 5 topics in 2s")
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
seen_topics.insert(msg.topic);
|
||||
}
|
||||
|
||||
// Assert
|
||||
assert_eq!(seen_topics.len(), 5);
|
||||
}
|
||||
|
||||
/// Empty client_id is rejected at the server boundary.
|
||||
#[tokio::test]
|
||||
async fn empty_client_id_is_rejected() {
|
||||
// Arrange
|
||||
let (listener, port) = bind_ephemeral();
|
||||
let server = TelemetryStream::new(8);
|
||||
let _h = server.handle();
|
||||
let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap();
|
||||
|
||||
let mut client = connect(port).await;
|
||||
|
||||
// Act
|
||||
let err = client
|
||||
.subscribe(Request::new(SubscribeRequest {
|
||||
client_id: String::new(),
|
||||
topics: vec![],
|
||||
}))
|
||||
.await
|
||||
.expect_err("empty client_id must return Status error");
|
||||
|
||||
// Assert
|
||||
assert_eq!(err.code(), tonic::Code::InvalidArgument);
|
||||
}
|
||||
Reference in New Issue
Block a user