Files
autopilot/crates/gimbal_controller/src/internal/transport.rs
T
Oleksandr Bezdieniezhnykh 288e7f8c46
ci/woodpecker/push/build-arm Pipeline failed
[AZ-653] gimbal_controller ViewPro A40 vendor UDP transport (batch 10)
Implements the vendor wire protocol for the A40 gimbal (XOR-8 checksum,
not CRC16 — task spec corrected against ArduPilot AP_Mount_Viewpro.h):
frame encode/decode, typed FrameId/CameraCommand/ImageSensor, A1 angles,
C1 camera, C2 set-zoom command builders, and a tokio UdpSocket transport
with bounded retry, per-command deadline, and atomic vendor-fault
counters surfaced via faults()/health(). GimbalControllerHandle::set_pose
and zoom now ride the transport when wired; remain disabled when no
transport is bound. 32/32 gimbal_controller tests green; workspace test
suite green except for a pre-existing flake in
mission_executor::state_machine::ac3_bounded_retry_then_success that
reproduces only under parallel workspace test load (passes 5/5 in
isolation; flagged in batch 8 report, unrelated to this batch).

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-19 20:07:32 +03:00

331 lines
12 KiB
Rust

//! UDP transport for the ViewPro A40.
//!
//! Owns the [`UdpSocket`], the rolling frame counter, the bounded
//! retry policy, and the vendor-fault counters that feed the
//! component's health surface. Inbound frames are checksum-validated
//! by [`super::a40_protocol::decode_frame`]; mismatches are counted
//! as `vendor_faults_total{kind="crc"}` and dropped.
//!
//! The transport is **command/response**, matched on the reply's `FrameId`
//! (the rolling frame counter is stamped on outbound frames but is not
//! compared when matching replies): each `send_with_response` issues a
//! frame, awaits the next matching inbound frame within a per-command
//! deadline, and makes up to `max_retries` attempts in total when an
//! attempt times out. Unmatched inbound frames (e.g. the gimbal's
//! HEARTBEAT) are still surfaced through the broadcast stream so a
//! future telemetry pump can consume them.
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::UdpSocket;
use tokio::sync::{broadcast, Mutex};
use tokio::task::JoinHandle;
use tokio::time::{timeout, Instant};
use super::a40_protocol::frame::{decode_frame, encode_frame, Frame, FrameDecodeError, FrameId};
/// Default per-command response deadline. `send_with_response` applies it
/// to each attempt separately, not to the command as a whole. The NFR is
/// ≤200 ms on a healthy link; 150 ms leaves headroom for the
/// bounded-retry budget.
/// NOTE(review): with the default budget of 3 attempts the worst case
/// before `MaxRetriesExceeded` is roughly 3 × 150 ms — confirm that is
/// acceptable against the NFR.
pub const DEFAULT_COMMAND_DEADLINE: Duration = Duration::from_millis(150);
/// Default retry budget for `send_with_response`. The value is the total
/// number of send attempts (the first send included). Vendor link is
/// best-effort UDP; bounded retries match the AZ-651 ladder pattern.
pub const DEFAULT_MAX_RETRIES: u8 = 3;
/// Broadcast channel capacity for inbound frames. Slow consumers
/// see `Lagged`; the transport itself is unaffected.
pub const INBOUND_CHANNEL_CAPACITY: usize = 64;
/// Vendor-fault counters that feed the component's health surface.
///
/// Each counter is an independent atomic updated by the transport.
/// [`VendorFaults::snapshot`] loads the three counters one after another
/// with relaxed ordering, so every value is individually accurate but the
/// three are not captured at a single instant.
#[derive(Debug, Default)]
pub struct VendorFaults {
/// Inbound frames that failed checksum / framing validation.
pub crc: std::sync::atomic::AtomicU64,
/// `send_with_response` attempts that ended without a matching
/// response. Incremented once per timed-out attempt, so a command that
/// exhausts a budget of N attempts adds N to this counter.
pub timeout: std::sync::atomic::AtomicU64,
/// Inbound frames whose `FrameId` could not be decoded.
pub unknown_frame_id: std::sync::atomic::AtomicU64,
}
impl VendorFaults {
    /// Adds one to `counter`. Relaxed ordering is used because the
    /// counters are pure statistics and do not guard any other memory.
    fn bump(counter: &std::sync::atomic::AtomicU64) {
        counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }

    /// Reads the current value of `counter` with relaxed ordering.
    fn read(counter: &std::sync::atomic::AtomicU64) -> u64 {
        counter.load(std::sync::atomic::Ordering::Relaxed)
    }

    /// Records one inbound frame rejected by checksum / framing validation.
    fn inc_crc(&self) {
        Self::bump(&self.crc);
    }

    /// Records one command attempt that ended without a matching reply.
    fn inc_timeout(&self) {
        Self::bump(&self.timeout);
    }

    /// Records one inbound frame whose `FrameId` could not be decoded.
    fn inc_unknown_frame_id(&self) {
        Self::bump(&self.unknown_frame_id);
    }

    /// Copies the current counter values into a plain-data snapshot.
    ///
    /// The counters are loaded one at a time (crc, timeout,
    /// unknown_frame_id), so concurrent increments may land between loads.
    pub fn snapshot(&self) -> VendorFaultsSnapshot {
        let Self {
            crc,
            timeout,
            unknown_frame_id,
        } = self;
        VendorFaultsSnapshot {
            crc: Self::read(crc),
            timeout: Self::read(timeout),
            unknown_frame_id: Self::read(unknown_frame_id),
        }
    }
}
/// Read-side snapshot of [`VendorFaults`]: plain `u64` copies of the three
/// counters as loaded by [`VendorFaults::snapshot`]. Being `Copy` and
/// `Eq`, it can be passed around and compared without touching the live
/// atomics.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct VendorFaultsSnapshot {
/// Value of `VendorFaults::crc` when the snapshot was taken.
pub crc: u64,
/// Value of `VendorFaults::timeout` when the snapshot was taken.
pub timeout: u64,
/// Value of `VendorFaults::unknown_frame_id` when the snapshot was taken.
pub unknown_frame_id: u64,
}
/// Errors returned by [`A40Transport`] operations.
#[derive(Debug, thiserror::Error)]
pub enum A40Error {
/// `encode_frame` rejected the payload (it returned `None`), so the
/// frame was not sent.
#[error("frame too large for vendor protocol (max body 63 bytes)")]
FrameTooLarge,
/// Every attempt of `send_with_response` ended without an inbound
/// frame whose id equals `expected`; `attempts` is the number of
/// attempts that were made.
#[error("max retries exceeded ({attempts} attempts) waiting for {expected:?}")]
MaxRetriesExceeded { attempts: u8, expected: FrameId },
/// A socket operation failed. Converted from `std::io::Error` through
/// the generated `From` impl, so `?` can be used on socket calls.
#[error("UDP I/O: {0}")]
Io(#[from] std::io::Error),
/// The inbound broadcast channel reported `Closed` while a command was
/// waiting for its reply.
#[error("inbound broadcast channel closed")]
InboundChannelClosed,
}
/// UDP transport for the A40. Cheap to clone: the socket, the fault
/// counters and the frame counter are shared through `Arc`, and the
/// inbound broadcast sender is a cloneable handle onto the same channel.
///
/// `command_deadline` and `max_retries` are plain values copied into each
/// clone, so the `with_*` builders affect only the instance they are
/// called on and not clones taken earlier.
#[derive(Clone)]
pub struct A40Transport {
/// Socket used to send commands; the receive task holds another handle
/// to the same socket.
socket: Arc<UdpSocket>,
/// Remote address recorded at construction and returned by `peer()`.
peer: SocketAddr,
/// Sender side of the inbound-frame broadcast; used to create new
/// receivers for subscribers and for each `send_with_response` call.
inbound_tx: broadcast::Sender<Frame>,
/// Fault counters shared with the receive task.
faults: Arc<VendorFaults>,
/// Rolling counter stamped on outbound frames; `next_counter` keeps it
/// in the range 0..=3.
frame_counter: Arc<Mutex<u8>>,
/// Time allowed for a reply on each attempt of `send_with_response`.
command_deadline: Duration,
/// Attempt budget for `send_with_response` (a value of 0 is treated as 1).
max_retries: u8,
}
impl A40Transport {
/// Binds a UDP socket on `local`, connects it to `peer`, and starts a
/// transport on top of it.
///
/// The receive task is spawned by [`Self::from_socket`]; its
/// `JoinHandle` is returned next to the transport so the caller decides
/// how long the task lives.
///
/// # Errors
/// Returns `A40Error::Io` when binding or connecting the socket fails.
pub async fn bind(
    local: SocketAddr,
    peer: SocketAddr,
) -> Result<(Self, JoinHandle<()>), A40Error> {
    let connected = {
        let sock = UdpSocket::bind(local).await?;
        sock.connect(peer).await?;
        Arc::new(sock)
    };
    Self::from_socket(connected, peer)
}
/// Builds a transport on an already-bound socket and spawns its receive
/// task. Used by tests that need to control both endpoints.
///
/// The transport talks to the socket with `send`/`recv`, so `socket`
/// must already be connected to the remote endpoint; `peer` is only
/// recorded for [`Self::peer`] and is not checked against the socket.
/// The deadline and attempt budget start at `DEFAULT_COMMAND_DEADLINE`
/// and `DEFAULT_MAX_RETRIES`.
///
/// This constructor currently always returns `Ok`; the `Result` keeps the
/// signature aligned with [`Self::bind`].
///
/// # Panics
/// `tokio::spawn` panics when called outside a Tokio runtime, so this
/// must be invoked from within one.
pub fn from_socket(
    socket: Arc<UdpSocket>,
    peer: SocketAddr,
) -> Result<(Self, JoinHandle<()>), A40Error> {
    // The initial receiver is never read; consumers attach later through
    // `subscribe_inbound` or `send_with_response`.
    let (inbound_tx, _initial_rx) = broadcast::channel::<Frame>(INBOUND_CHANNEL_CAPACITY);
    let faults = Arc::new(VendorFaults::default());

    // Hand the receive task its own handles, then move the originals
    // into the transport.
    let recv_task = tokio::spawn(receive_loop(
        Arc::clone(&socket),
        inbound_tx.clone(),
        Arc::clone(&faults),
    ));
    let transport = Self {
        socket,
        peer,
        inbound_tx,
        faults,
        frame_counter: Arc::new(Mutex::new(0)),
        command_deadline: DEFAULT_COMMAND_DEADLINE,
        max_retries: DEFAULT_MAX_RETRIES,
    };
    Ok((transport, recv_task))
}
pub fn with_command_deadline(mut self, deadline: Duration) -> Self {
self.command_deadline = deadline;
self
}
pub fn with_max_retries(mut self, retries: u8) -> Self {
self.max_retries = retries;
self
}
/// Creates a new receiver on the inbound-frame broadcast.
///
/// The receiver only observes frames broadcast after this call. A
/// receiver that falls more than the channel capacity behind gets
/// `RecvError::Lagged` and is responsible for resyncing.
pub fn subscribe_inbound(&self) -> broadcast::Receiver<Frame> {
    let sender = &self.inbound_tx;
    sender.subscribe()
}
pub fn faults(&self) -> VendorFaultsSnapshot {
self.faults.snapshot()
}
/// Sends a single frame without waiting for any reply and without
/// retrying. Intended for outbound packets the vendor does not
/// acknowledge (e.g. `M_AHRS` attitude pushes).
///
/// The frame counter is taken before the payload is encoded, so a
/// rejected payload still advances the counter.
///
/// # Errors
/// * `A40Error::FrameTooLarge` when `encode_frame` rejects the payload.
/// * `A40Error::Io` when the socket send fails.
pub async fn send_oneway(&self, frame_id: FrameId, data: &[u8]) -> Result<(), A40Error> {
    let counter = self.next_counter().await;
    match encode_frame(frame_id, data, counter) {
        Some(datagram) => {
            self.socket.send(&datagram).await?;
            Ok(())
        }
        None => Err(A40Error::FrameTooLarge),
    }
}
/// Send a frame and await the first inbound frame whose
/// `FrameId` matches `expected_reply` within the per-command
/// deadline. Retries up to `max_retries` times on timeout;
/// returns `Err(MaxRetriesExceeded)` on cap exhaustion.
///
/// Inbound frames with non-matching ids are still broadcast to
/// subscribers; they just don't satisfy *this* call.
pub async fn send_with_response(
&self,
frame_id: FrameId,
data: &[u8],
expected_reply: FrameId,
) -> Result<Frame, A40Error> {
let bytes_template = {
// Re-encode per attempt because the counter increments;
// do one bounds check up-front so we never enter the
// retry loop with a doomed frame.
let probe_counter = 0u8;
encode_frame(frame_id, data, probe_counter).ok_or(A40Error::FrameTooLarge)?
};
// Use `bytes_template` purely as a size validator above; the
// counter we actually use is fresh per attempt.
drop(bytes_template);
let mut inbound_rx = self.inbound_tx.subscribe();
let deadline = self.command_deadline;
let max_retries = self.max_retries.max(1);
let mut attempts: u8 = 0;
while attempts < max_retries {
attempts += 1;
let counter = self.next_counter().await;
let bytes = encode_frame(frame_id, data, counter).ok_or(A40Error::FrameTooLarge)?;
self.socket.send(&bytes).await?;
// Await the next matching inbound frame within the
// deadline. We re-loop on non-matching frames so the
// gimbal's HEARTBEAT etc. doesn't cancel our wait.
let started = Instant::now();
loop {
let remaining = deadline.saturating_sub(started.elapsed());
if remaining.is_zero() {
break;
}
match timeout(remaining, inbound_rx.recv()).await {
Ok(Ok(frame)) if frame.frame_id == expected_reply => {
return Ok(frame);
}
Ok(Ok(_other)) => continue,
Ok(Err(broadcast::error::RecvError::Lagged(_))) => {
// We may have missed the reply; treat as
// timeout for this attempt rather than
// hanging.
break;
}
Ok(Err(broadcast::error::RecvError::Closed)) => {
return Err(A40Error::InboundChannelClosed);
}
Err(_elapsed) => break, // timed out
}
}
self.faults.inc_timeout();
tracing::warn!(
attempts,
max_retries,
?frame_id,
?expected_reply,
"A40 command timeout; retrying"
);
}
Err(A40Error::MaxRetriesExceeded {
attempts,
expected: expected_reply,
})
}
pub fn peer(&self) -> SocketAddr {
self.peer
}
/// Returns the frame counter to stamp on the next outbound frame and
/// advances the shared counter.
///
/// The stored value is masked to two bits after each increment, so the
/// sequence handed out is 0, 1, 2, 3, 0, … across all clones of the
/// transport.
async fn next_counter(&self) -> u8 {
    let mut slot = self.frame_counter.lock().await;
    let successor = slot.wrapping_add(1) & 0b11;
    // Store the successor and hand back the value that was current.
    std::mem::replace(&mut *slot, successor)
}
}
async fn receive_loop(
socket: Arc<UdpSocket>,
inbound_tx: broadcast::Sender<Frame>,
faults: Arc<VendorFaults>,
) {
// Vendor packet ceiling is 63 bytes; round up to 128 for safety.
let mut buf = [0u8; 128];
loop {
match socket.recv(&mut buf).await {
Ok(len) => match decode_frame(&buf[..len]) {
Ok(frame) => {
let _ = inbound_tx.send(frame);
}
Err(FrameDecodeError::BadChecksum { .. }) => {
faults.inc_crc();
tracing::debug!("A40 inbound checksum mismatch; dropping frame");
}
Err(FrameDecodeError::UnknownFrameId(_)) => {
faults.inc_unknown_frame_id();
}
Err(e) => {
// Other framing errors share the crc counter
// (they are all "frame envelope invalid" faults
// from the operator's perspective).
faults.inc_crc();
tracing::debug!(error=?e, "A40 inbound frame rejected");
}
},
Err(e) => {
tracing::error!(error=%e, "A40 transport recv error; shutting down receive loop");
return;
}
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed counter set reports zero for every counter.
    #[test]
    fn faults_default_zero() {
        let observed = VendorFaults::default().snapshot();

        assert_eq!(
            observed,
            VendorFaultsSnapshot {
                crc: 0,
                timeout: 0,
                unknown_frame_id: 0,
            }
        );
    }

    /// Bumping one counter must not leak into the others.
    #[test]
    fn faults_counters_increment_independently() {
        let counters = VendorFaults::default();

        for _ in 0..2 {
            counters.inc_crc();
        }
        counters.inc_timeout();

        assert_eq!(
            counters.snapshot(),
            VendorFaultsSnapshot {
                crc: 2,
                timeout: 1,
                unknown_frame_id: 0,
            }
        );
    }
}