mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-22 18:01:10 +00:00
[AZ-675] telemetry_stream Tonic gRPC server + per-client lossy queue
ci/woodpecker/push/build-arm Pipeline failed
ci/woodpecker/push/build-arm Pipeline failed
Pins operator-link transport to gRPC server-streaming (closes architecture Q2 in favour of gRPC). Adds first-time tonic / prost / tonic-build infrastructure to the workspace; uses protoc-bin-vendored so neither dev machines nor CI need system protoc installed. Design — back-pressure lives in the per-topic tokio::sync::broadcast ring, drained directly by the tonic-streamed response via BroadcastStream + StreamMap. No intermediate mpsc buffer that could absorb back-pressure invisibly. Slow client overrun -> Lagged(n) event -> per-(client_id, topic) drop counter incremented; healthy clients on the same topic are unaffected. Service surface — Subscribe(SubscribeRequest) -> stream TelemetryMessage; five topics (TelemetrySample, GimbalState, DetectionEvent, MovementCandidate, MapObjectsBundle); empty topics list defaults to subscribe-all; empty client_id rejected; stream drop decrements subscribed_clients via StreamGuard. TelemetrySink push_detections is now real; push_frame still NotImplemented(AZ-676 video path). Tests — 6 unit + 5 integration (AC-1..AC-3 via in-process gRPC client, plus subscribe-all default + empty-client_id rejection). Clippy on telemetry_stream clean. Pre-existing mission_executor ac3 test polling race surfaces more reliably under the new tonic build pressure; documented as _docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md and unchanged by this batch. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -1,57 +1,224 @@
|
||||
//! `telemetry_stream` — always-on uplink to the Ground Station + operator-command downlink.
//!
//! Real implementations:
//! - **AZ-675 `telemetry_stream_grpc_server` (this crate, this batch)**: Tonic
//!   gRPC server, per-client bounded queue, drop-oldest back-pressure, drop
//!   counters. Topics: `TelemetrySample`, `GimbalState`, `DetectionEvent`,
//!   `MovementCandidate`, `MapObjectsBundle`.
//! - **AZ-676 `telemetry_stream_video_path`**: video frame topic (separate RPC,
//!   server-streamed binary payloads).
//! - **AZ-677 `telemetry_stream_mapobjects_snapshot`**: diff-based snapshot
//!   emission for `MapObjectsBundle`.
//! - **AZ-678+**: command-auth on the return path (operator_bridge).
|
||||
|
||||
pub mod internal;
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::task::JoinHandle;
|
||||
use tonic::transport::Server;
|
||||
|
||||
use shared::contracts::TelemetrySink;
|
||||
use shared::error::{AutopilotError, Result};
|
||||
use shared::health::ComponentHealth;
|
||||
use shared::health::{ComponentHealth, HealthLevel};
|
||||
use shared::models::detection::DetectionBatch;
|
||||
use shared::models::frame::Frame;
|
||||
use shared::models::operator::OperatorCommand;
|
||||
|
||||
use crate::internal::proto::telemetry_stream_server::TelemetryStreamServer;
|
||||
use crate::internal::proto::Topic;
|
||||
use crate::internal::publisher::{TelemetryPublisher, DEFAULT_TOPIC_CAPACITY};
|
||||
use crate::internal::server::TelemetryService;
|
||||
|
||||
pub use crate::internal::proto::{
|
||||
telemetry_stream_client::TelemetryStreamClient, SubscribeRequest, TelemetryMessage,
|
||||
Topic as TelemetryTopic,
|
||||
};
|
||||
pub use crate::internal::publisher::{
|
||||
PerTopicCounters, PublishError, PublisherSnapshot, ALL_TOPICS,
|
||||
};
|
||||
|
||||
const NAME: &str = "telemetry_stream";
|
||||
|
||||
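/// Cumulative drop count for a single (client, topic) pair at or above
/// which health flips to yellow. It is compared against the snapshot's
/// `drops_total` counters, so it is a total rather than a rate. Picked to
/// surface persistent slow consumers without flapping on a single
/// transient lag spike.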
|
||||
const DROP_YELLOW_THRESHOLD: u64 = 100;
|
||||
|
||||
/// Settings consumed by `TelemetryStream::with_config`.
#[derive(Debug, Clone)]
pub struct TelemetryStreamConfig {
    /// Socket address the Tonic gRPC server binds to
    /// (`0.0.0.0:50061` in the `Default` impl).
    pub listen_addr: SocketAddr,
    /// Capacity of each topic's broadcast channel (per subscriber buffer).
    pub topic_capacity: usize,
    /// Bound of the downlink command channel whose receiving end feeds
    /// `operator_bridge`.
    pub downlink_capacity: usize,
}
|
||||
|
||||
impl Default for TelemetryStreamConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
listen_addr: "0.0.0.0:50061".parse().expect("hardcoded addr parses"),
|
||||
topic_capacity: DEFAULT_TOPIC_CAPACITY,
|
||||
downlink_capacity: 64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TelemetryStream {
|
||||
publisher: Arc<TelemetryPublisher>,
|
||||
commands_tx: mpsc::Sender<OperatorCommand>,
|
||||
commands_rx: Option<mpsc::Receiver<OperatorCommand>>,
|
||||
config: TelemetryStreamConfig,
|
||||
}
|
||||
|
||||
impl TelemetryStream {
|
||||
pub fn new(downlink_capacity: usize) -> Self {
|
||||
let (commands_tx, commands_rx) = mpsc::channel(downlink_capacity);
|
||||
Self::with_config(TelemetryStreamConfig {
|
||||
downlink_capacity,
|
||||
..TelemetryStreamConfig::default()
|
||||
})
|
||||
}
|
||||
|
||||
pub fn with_config(config: TelemetryStreamConfig) -> Self {
|
||||
let publisher = TelemetryPublisher::new(config.topic_capacity);
|
||||
let (commands_tx, commands_rx) = mpsc::channel(config.downlink_capacity);
|
||||
Self {
|
||||
publisher,
|
||||
commands_tx,
|
||||
commands_rx: Some(commands_rx),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn handle(&self) -> TelemetryStreamHandle {
|
||||
TelemetryStreamHandle {
|
||||
publisher: Arc::clone(&self.publisher),
|
||||
commands_tx: self.commands_tx.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Take the downlink command receiver. The composition root forwards it to
|
||||
/// `operator_bridge` as `Receiver<OperatorCommand>`.
|
||||
/// Take the downlink command receiver. The composition root
|
||||
/// forwards it to `operator_bridge` as `Receiver<OperatorCommand>`.
|
||||
pub fn take_command_receiver(&mut self) -> Option<mpsc::Receiver<OperatorCommand>> {
|
||||
self.commands_rx.take()
|
||||
}
|
||||
|
||||
/// Spawn the Tonic server. Returns a JoinHandle that runs until
|
||||
/// `shutdown` is signalled (closing the returned `shutdown_tx`).
|
||||
/// The server is bound on `config.listen_addr`.
|
||||
pub fn spawn_grpc_server(
|
||||
&self,
|
||||
) -> Result<(
|
||||
JoinHandle<std::result::Result<(), tonic::transport::Error>>,
|
||||
GrpcShutdown,
|
||||
)> {
|
||||
let listen_addr = self.config.listen_addr;
|
||||
let publisher = Arc::clone(&self.publisher);
|
||||
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
|
||||
|
||||
let svc = TelemetryStreamServer::new(TelemetryService::new(publisher));
|
||||
let join = tokio::spawn(async move {
|
||||
Server::builder()
|
||||
.add_service(svc)
|
||||
.serve_with_shutdown(listen_addr, async move {
|
||||
let _ = shutdown_rx.await;
|
||||
})
|
||||
.await
|
||||
});
|
||||
|
||||
Ok((
|
||||
join,
|
||||
GrpcShutdown {
|
||||
tx: Some(shutdown_tx),
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
/// Spawn the Tonic server bound on a specific `TcpListener`.
|
||||
/// Useful for tests that need to know the actual port ahead of
|
||||
/// time (bind to `127.0.0.1:0` then read the assigned port).
|
||||
pub fn spawn_grpc_server_on(
|
||||
&self,
|
||||
listener: std::net::TcpListener,
|
||||
) -> Result<(
|
||||
JoinHandle<std::result::Result<(), tonic::transport::Error>>,
|
||||
GrpcShutdown,
|
||||
)> {
|
||||
listener
|
||||
.set_nonblocking(true)
|
||||
.map_err(|e| AutopilotError::Internal(format!("set_nonblocking: {e}")))?;
|
||||
let tokio_listener = tokio::net::TcpListener::from_std(listener)
|
||||
.map_err(|e| AutopilotError::Internal(format!("TcpListener::from_std: {e}")))?;
|
||||
let stream = tokio_stream::wrappers::TcpListenerStream::new(tokio_listener);
|
||||
|
||||
let publisher = Arc::clone(&self.publisher);
|
||||
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
|
||||
let svc = TelemetryStreamServer::new(TelemetryService::new(publisher));
|
||||
|
||||
let join = tokio::spawn(async move {
|
||||
Server::builder()
|
||||
.add_service(svc)
|
||||
.serve_with_incoming_shutdown(stream, async move {
|
||||
let _ = shutdown_rx.await;
|
||||
})
|
||||
.await
|
||||
});
|
||||
|
||||
Ok((
|
||||
join,
|
||||
GrpcShutdown {
|
||||
tx: Some(shutdown_tx),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII shutdown trigger for the spawned gRPC server. Drop the value
|
||||
/// or call `shutdown()` to stop the server.
|
||||
pub struct GrpcShutdown {
|
||||
tx: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
}
|
||||
|
||||
impl GrpcShutdown {
|
||||
pub fn shutdown(mut self) {
|
||||
if let Some(tx) = self.tx.take() {
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GrpcShutdown {
|
||||
fn drop(&mut self) {
|
||||
if let Some(tx) = self.tx.take() {
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TelemetryStreamHandle {
|
||||
publisher: Arc<TelemetryPublisher>,
|
||||
commands_tx: mpsc::Sender<OperatorCommand>,
|
||||
}
|
||||
|
||||
impl TelemetryStreamHandle {
|
||||
/// Inject an operator command. Production path is fed by the downlink
|
||||
/// receiver in `internal::downlink/*`; tests can call this directly.
|
||||
/// Publish a payload on `topic`. Never blocks the caller; slow
|
||||
/// subscribers experience drops accounted in [`snapshot`].
|
||||
pub fn publish<T: serde::Serialize>(
|
||||
&self,
|
||||
topic: TelemetryTopic,
|
||||
payload: &T,
|
||||
) -> std::result::Result<(), PublishError> {
|
||||
self.publisher.publish(topic, payload)
|
||||
}
|
||||
|
||||
/// Inject an operator command downlink. Production path is fed
|
||||
/// by the gRPC return half once AZ-678 lands; tests may call this
|
||||
/// directly.
|
||||
pub async fn submit_command(&self, command: OperatorCommand) -> Result<()> {
|
||||
self.commands_tx
|
||||
.send(command)
|
||||
@@ -59,8 +226,32 @@ impl TelemetryStreamHandle {
|
||||
.map_err(|_| AutopilotError::Internal("downlink channel closed".into()))
|
||||
}
|
||||
|
||||
/// Return the publisher's current counters as a `PublisherSnapshot`.
pub fn snapshot(&self) -> PublisherSnapshot {
    let publisher = &self.publisher;
    publisher.snapshot()
}
|
||||
|
||||
pub fn health(&self) -> ComponentHealth {
|
||||
ComponentHealth::disabled(NAME)
|
||||
let snap = self.publisher.snapshot();
|
||||
let mut h = ComponentHealth::green(NAME);
|
||||
|
||||
let hot_drops: Vec<_> = snap
|
||||
.drops_total
|
||||
.iter()
|
||||
.filter(|(_, &v)| v >= DROP_YELLOW_THRESHOLD)
|
||||
.collect();
|
||||
|
||||
let detail = format!(
|
||||
"subscribers={} published_total={} hot_drop_pairs={}",
|
||||
snap.subscribed_clients,
|
||||
snap.published_total,
|
||||
hot_drops.len()
|
||||
);
|
||||
|
||||
if !hot_drops.is_empty() {
|
||||
h.level = HealthLevel::Yellow;
|
||||
}
|
||||
h.detail = Some(detail);
|
||||
h
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,14 +259,14 @@ impl TelemetryStreamHandle {
|
||||
impl TelemetrySink for TelemetryStreamHandle {
|
||||
async fn push_frame(&self, _frame: Frame) -> Result<()> {
|
||||
Err(AutopilotError::NotImplemented(
|
||||
"telemetry_stream::push_frame (AZ-676)",
|
||||
"telemetry_stream::push_frame (AZ-676 video path)",
|
||||
))
|
||||
}
|
||||
|
||||
async fn push_detections(&self, _batch: DetectionBatch) -> Result<()> {
|
||||
Err(AutopilotError::NotImplemented(
|
||||
"telemetry_stream::push_detections (AZ-675)",
|
||||
))
|
||||
async fn push_detections(&self, batch: DetectionBatch) -> Result<()> {
|
||||
self.publisher
|
||||
.publish(Topic::DetectionEvent, &batch)
|
||||
.map_err(|e| AutopilotError::Internal(format!("publish detections: {e}")))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,8 +275,35 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
/// A freshly built stream reports no subscribers, nothing published and
/// green health.
#[test]
fn handle_starts_with_zero_subscribers_and_green_health() {
    // Arrange
    let s = TelemetryStream::new(8);
    let h = s.handle();

    // Act
    let snap = h.snapshot();
    let health = h.health();

    // Assert
    assert_eq!(snap.subscribed_clients, 0);
    assert_eq!(snap.published_total, 0);
    assert_eq!(health.level, HealthLevel::Green);
}
|
||||
|
||||
/// Publishing with nobody subscribed still succeeds and bumps the
/// topic's `published` counter.
#[test]
fn publish_without_subscribers_is_no_op_but_counts() {
    // Arrange
    let stream = TelemetryStream::new(8);
    let handle = stream.handle();
    let payload = serde_json::json!({"v": 1});

    // Act
    handle
        .publish(TelemetryTopic::TelemetrySample, &payload)
        .unwrap();

    // Assert
    let snap = handle.snapshot();
    assert_eq!(snap.per_topic[&Topic::TelemetrySample].published, 1);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user