mirror of
https://github.com/azaion/autopilot.git
synced 2026-06-21 12:41:10 +00:00
8a4bd00526
AZ-650 (mission_executor pre-flight Built-In Test):
- BitEvaluator trait + BitItemStatus { Pass, Degraded, Fail, Skipped }
+ BitReport + BitOverall fusion. Pluggable per-item evaluators so
the composition root decides which dependencies are wired today.
- BitController owns evaluator list + mpsc ack channel + sticky-pass
+ ack deadline. Publishes bit_ok via tokio watch — composition root
pipes it into the telemetry projection where the existing FSM
bit_ok guard already consumes it (no FSM changes needed).
- BitState { Idle, Pass, AwaitingAck { report_id }, Failed { reason } }
with broadcast::Sender<BitEvent> for operator-side observability.
Sticky-pass semantics: once Pass is reached (directly or via signed
ack on a Degraded report), the controller stops re-evaluating —
BIT is a one-shot pre-flight gate, not a continuous monitor.
- BitDegradedAck arrives pre-validated by operator_bridge; the
controller only matches report_id and applies the operator id to
the audit log.
- Concrete evaluators landed today (3 of 12 spec items, the rest
depend on components still in todo/):
- StateDirFreeSpaceEvaluator (dir creatable/readable; statvfs is
documented follow-up).
- WallClockBoundEvaluator (chrono::Utc::now vs configurable bound).
- MissionLoadedEvaluator (waypoint count via Arc<Mutex<usize>>).
- MapObjectsSyncedEvaluator (maps SyncState -> BIT status per Q9).
Tests:
- ac1_all_pass_proceeds, ac2_fail_blocks_transition,
ac3_degraded_requires_signed_ack (+ mismatched_ack supplement),
ac4_degraded_ack_timeout_fails_the_bit — all 4 ACs green.
- Pure next_state table covered by lib unit tests.
- Per-evaluator unit tests for Pass/Fail/Degraded branches.
Quality gates:
- cargo fmt: clean.
- cargo clippy -p mission_executor --tests -- -D warnings: 0 warns.
- cargo test --workspace: all green.
- Pre-existing flake in state_machine::ac3_bounded_retry_then_success
(batch 7 report) is still present — passes on rerun.
Co-authored-by: Cursor <cursoragent@cursor.com>
219 lines
8.4 KiB
Rust
219 lines
8.4 KiB
Rust
//! AZ-668 — persistence trait + default JSON snapshot engine.
|
|
//!
|
|
//! Default engine per Q3: in-memory + atomic JSON snapshot. The trait
|
|
//! is kept narrow on purpose so a future SQLite+H3 / RocksDB engine
|
|
//! can swap in without touching call sites.
|
|
//!
|
|
//! Crash-safety: writes go to `${state_dir}/mapobjects/<mission_id>.json.tmp`,
//! are fsync'd, then atomically renamed onto the final path. The parent
//! directory is then fsync'd on a best-effort basis (failures are ignored)
//! so that, where the platform supports it, the rename itself survives
//! a power loss. Interrupted writes leave the `.tmp` file behind; the
//! next `load_snapshot` ignores it.
|
|
//!
|
|
//! Corruption surfaces as [`PersistenceError::Corrupt`]: the caller MUST
|
|
//! refuse to start with stale state and propagate the error to the
|
|
//! operator (AZ-668 AC-4). The engine does NOT silently fall back to
|
|
//! an empty store.
|
|
|
|
use std::path::{Path, PathBuf};
|
|
|
|
use async_trait::async_trait;
|
|
use thiserror::Error;
|
|
use tokio::sync::Mutex as AsyncMutex;
|
|
use tokio::{fs, io::AsyncWriteExt};
|
|
|
|
use super::snapshot::Snapshot;
|
|
|
|
/// Errors surfaced by [`MapObjectsPersistence`].
|
|
#[derive(Debug, Error)]
|
|
pub enum PersistenceError {
|
|
#[error("persistence I/O error: {0}")]
|
|
Io(#[from] std::io::Error),
|
|
/// The snapshot file was present but unreadable. The caller MUST
|
|
/// refuse to start with stale state and surface the error to the
|
|
/// operator — never silently start empty (AZ-668 AC-4).
|
|
#[error("snapshot corrupt at {path}: {reason}")]
|
|
Corrupt { path: PathBuf, reason: String },
|
|
/// Schema version mismatch — the on-disk blob predates the running
|
|
/// binary. Treated as corruption (operator must reconcile).
|
|
#[error("snapshot schema mismatch at {path}: expected {expected}, found {found}")]
|
|
SchemaMismatch {
|
|
path: PathBuf,
|
|
expected: u32,
|
|
found: u32,
|
|
},
|
|
}
|
|
|
|
/// Engine-level metrics surfaced to the health aggregator.
|
|
/// Per AZ-668 §Outcome: `last_snapshot_ts`, `snapshot_size_bytes`,
|
|
/// `snapshot_errors_total`.
|
|
#[derive(Debug, Clone, Default)]
|
|
pub struct PersistenceMetrics {
|
|
pub last_snapshot_ts: Option<chrono::DateTime<chrono::Utc>>,
|
|
pub snapshot_size_bytes: Option<u64>,
|
|
pub snapshot_errors_total: u64,
|
|
}
|
|
|
|
/// Pluggable persistence backend. The default impl is the JSON
|
|
/// snapshot engine (below); future Q3 engines (SQLite+H3, RocksDB, …)
|
|
/// implement this trait without breaking call sites.
|
|
///
|
|
/// Methods are `async` because file I/O on the Jetson can stall while
|
|
/// the SD card is busy with detection-evidence writes; blocking the
|
|
/// runtime worker thread would starve `mavlink_layer`'s heartbeat
|
|
/// task. Implementations that do nothing async can delegate to
|
|
/// `tokio::task::spawn_blocking`.
|
|
#[async_trait]
|
|
pub trait MapObjectsPersistence: Send + Sync {
|
|
/// Atomically persist `snapshot` keyed by its `mission_id`.
|
|
/// Implementations MUST guarantee no partial writes are visible to
|
|
/// `load_snapshot` — typically by writing to a `.tmp` sibling then
|
|
/// renaming.
|
|
async fn save_snapshot(&self, snapshot: &Snapshot) -> Result<(), PersistenceError>;
|
|
|
|
/// Load the most recent snapshot for `mission_id`. Returns
|
|
/// `Ok(None)` if no snapshot exists; `Err(Corrupt)` on a present
|
|
/// but unreadable blob (the caller MUST refuse to start).
|
|
async fn load_snapshot(&self, mission_id: &str) -> Result<Option<Snapshot>, PersistenceError>;
|
|
|
|
/// Engine metrics for the health surface.
|
|
fn metrics(&self) -> PersistenceMetrics;
|
|
}
|
|
|
|
/// Default Q3 engine: one JSON file per mission, atomic-renamed on
|
|
/// each write.
|
|
///
|
|
/// Path layout: `${state_dir}/mapobjects/<mission_id>.json`. The
|
|
/// `mapobjects` subdirectory is created on first write.
|
|
pub struct JsonSnapshotEngine {
|
|
state_dir: PathBuf,
|
|
metrics: AsyncMutex<PersistenceMetrics>,
|
|
}
|
|
|
|
impl std::fmt::Debug for JsonSnapshotEngine {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_struct("JsonSnapshotEngine")
|
|
.field("state_dir", &self.state_dir)
|
|
.finish_non_exhaustive()
|
|
}
|
|
}
|
|
|
|
impl JsonSnapshotEngine {
|
|
/// Construct an engine rooted at `state_dir`. The directory does
|
|
/// not have to exist yet — it is created lazily on the first
|
|
/// successful `save_snapshot`.
|
|
pub fn new(state_dir: impl Into<PathBuf>) -> Self {
|
|
Self {
|
|
state_dir: state_dir.into(),
|
|
metrics: AsyncMutex::new(PersistenceMetrics::default()),
|
|
}
|
|
}
|
|
|
|
/// Resolve the canonical snapshot path for `mission_id`.
|
|
///
|
|
/// `mission_id` is treated as an opaque filename component. Callers
|
|
/// supply trusted ids from the central API; no path traversal
|
|
/// sanitisation is performed (the AZ-668 spec does not require it).
|
|
/// If untrusted ids ever flow in, add validation here.
|
|
pub fn snapshot_path(&self, mission_id: &str) -> PathBuf {
|
|
self.state_dir
|
|
.join("mapobjects")
|
|
.join(format!("{mission_id}.json"))
|
|
}
|
|
|
|
fn tmp_path(&self, mission_id: &str) -> PathBuf {
|
|
self.state_dir
|
|
.join("mapobjects")
|
|
.join(format!("{mission_id}.json.tmp"))
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl MapObjectsPersistence for JsonSnapshotEngine {
|
|
async fn save_snapshot(&self, snapshot: &Snapshot) -> Result<(), PersistenceError> {
|
|
let outcome = self.save_snapshot_inner(snapshot).await;
|
|
if outcome.is_err() {
|
|
let mut m = self.metrics.lock().await;
|
|
m.snapshot_errors_total = m.snapshot_errors_total.saturating_add(1);
|
|
}
|
|
outcome
|
|
}
|
|
|
|
async fn load_snapshot(&self, mission_id: &str) -> Result<Option<Snapshot>, PersistenceError> {
|
|
let path = self.snapshot_path(mission_id);
|
|
let outcome = self.load_snapshot_inner(&path).await;
|
|
if matches!(
|
|
outcome,
|
|
Err(PersistenceError::Corrupt { .. } | PersistenceError::SchemaMismatch { .. })
|
|
) {
|
|
let mut m = self.metrics.lock().await;
|
|
m.snapshot_errors_total = m.snapshot_errors_total.saturating_add(1);
|
|
}
|
|
outcome
|
|
}
|
|
|
|
fn metrics(&self) -> PersistenceMetrics {
|
|
// Cheap snapshot under a non-async borrow — `try_lock` keeps the
|
|
// health surface non-blocking; if the lock is contended we
|
|
// return zeros rather than parking the health caller.
|
|
self.metrics
|
|
.try_lock()
|
|
.map(|m| m.clone())
|
|
.unwrap_or_default()
|
|
}
|
|
}
|
|
|
|
impl JsonSnapshotEngine {
|
|
async fn save_snapshot_inner(&self, snapshot: &Snapshot) -> Result<(), PersistenceError> {
|
|
let path = self.snapshot_path(&snapshot.mission_id);
|
|
let tmp = self.tmp_path(&snapshot.mission_id);
|
|
let dir = path.parent().expect("snapshot path always has parent");
|
|
|
|
fs::create_dir_all(dir).await?;
|
|
let bytes = serde_json::to_vec(snapshot).map_err(|e| PersistenceError::Corrupt {
|
|
path: path.clone(),
|
|
reason: format!("serialize: {e}"),
|
|
})?;
|
|
let size = bytes.len() as u64;
|
|
|
|
{
|
|
let mut f = fs::File::create(&tmp).await?;
|
|
f.write_all(&bytes).await?;
|
|
f.sync_all().await?;
|
|
}
|
|
fs::rename(&tmp, &path).await?;
|
|
// Best-effort parent fsync so the rename survives a power
|
|
// loss. POSIX guarantees this is the durability anchor for
|
|
// directory operations; non-POSIX platforms ignore.
|
|
if let Ok(dir_handle) = std::fs::File::open(dir) {
|
|
let _ = dir_handle.sync_all();
|
|
}
|
|
|
|
let mut m = self.metrics.lock().await;
|
|
m.last_snapshot_ts = Some(chrono::Utc::now());
|
|
m.snapshot_size_bytes = Some(size);
|
|
Ok(())
|
|
}
|
|
|
|
async fn load_snapshot_inner(&self, path: &Path) -> Result<Option<Snapshot>, PersistenceError> {
|
|
let bytes = match fs::read(path).await {
|
|
Ok(b) => b,
|
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
|
|
Err(e) => return Err(e.into()),
|
|
};
|
|
let snapshot: Snapshot =
|
|
serde_json::from_slice(&bytes).map_err(|e| PersistenceError::Corrupt {
|
|
path: path.to_path_buf(),
|
|
reason: format!("deserialize: {e}"),
|
|
})?;
|
|
if snapshot.schema_version != Snapshot::CURRENT_SCHEMA_VERSION {
|
|
return Err(PersistenceError::SchemaMismatch {
|
|
path: path.to_path_buf(),
|
|
expected: Snapshot::CURRENT_SCHEMA_VERSION,
|
|
found: snapshot.schema_version,
|
|
});
|
|
}
|
|
Ok(Some(snapshot))
|
|
}
|
|
}
|