diff --git a/Cargo.lock b/Cargo.lock index 924c1cc..5ae691f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -538,6 +538,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", ] [[package]] @@ -877,6 +878,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "http" version = "1.4.0" @@ -1597,11 +1607,18 @@ name = "operator_bridge" version = "0.1.0" dependencies = [ "async-trait", + "chrono", + "hmac", "mapobjects_store", + "parking_lot", "serde", + "serde_json", + "sha2", "shared", + "thiserror 1.0.69", "tokio", "tracing", + "uuid", ] [[package]] @@ -2387,6 +2404,7 @@ name = "telemetry_stream" version = "0.1.0" dependencies = [ "async-trait", + "bytes", "chrono", "parking_lot", "prost", diff --git a/Cargo.toml b/Cargo.toml index 267b2c6..c1f5ab1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,6 +76,7 @@ parking_lot = "0.12" # Crypto / hashing sha2 = "0.10" +hmac = "0.12" # Wire encoding (VLM IPC) base64 = "0.22" diff --git a/_docs/02_tasks/todo/AZ-676_telemetry_stream_video_path.md b/_docs/02_tasks/done/AZ-676_telemetry_stream_video_path.md similarity index 100% rename from _docs/02_tasks/todo/AZ-676_telemetry_stream_video_path.md rename to _docs/02_tasks/done/AZ-676_telemetry_stream_video_path.md diff --git a/_docs/02_tasks/todo/AZ-677_telemetry_stream_mapobjects_snapshot.md b/_docs/02_tasks/done/AZ-677_telemetry_stream_mapobjects_snapshot.md similarity index 100% rename from _docs/02_tasks/todo/AZ-677_telemetry_stream_mapobjects_snapshot.md rename to _docs/02_tasks/done/AZ-677_telemetry_stream_mapobjects_snapshot.md diff --git a/_docs/02_tasks/todo/AZ-678_operator_bridge_command_auth.md b/_docs/02_tasks/done/AZ-678_operator_bridge_command_auth.md similarity index 100% rename from _docs/02_tasks/todo/AZ-678_operator_bridge_command_auth.md rename to _docs/02_tasks/done/AZ-678_operator_bridge_command_auth.md diff --git a/_docs/02_tasks/todo/AZ-679_operator_bridge_poi_surface.md b/_docs/02_tasks/done/AZ-679_operator_bridge_poi_surface.md similarity index 100% rename from _docs/02_tasks/todo/AZ-679_operator_bridge_poi_surface.md rename to _docs/02_tasks/done/AZ-679_operator_bridge_poi_surface.md diff --git a/_docs/03_implementation/batch_15_cycle1_report.md b/_docs/03_implementation/batch_15_cycle1_report.md new file mode 100644 index 0000000..c021bde --- /dev/null +++ b/_docs/03_implementation/batch_15_cycle1_report.md @@ -0,0 +1,195 @@ +# Batch 15 / Cycle 1 — Implementation Report + +**Date**: 2026-05-20 +**Tasks**: AZ-676, AZ-677, AZ-678, AZ-679 +**Verdict**: PASS_WITH_WARNINGS +- Pre-existing autopilot dead-code warning still open (C5; not touched by this batch). +- Pre-existing `mission_executor::state_machine::ac3_bounded_retry_then_success` flake still intermittent under workspace test load (C6; not touched by this batch). +- New optional surface in `OperatorBridge` (telemetry sink wiring) is gated by `with_telemetry_sink` / `with_validator` constructors — composition root in `crates/autopilot` will wire them in a future ticket (AZ-680 dispatch). + +## 1. Scope + +| Ticket | Title | Crate | Complexity | +|---|---|---|---| +| AZ-676 | telemetry_stream video path (rtsp_forward + bytes_inline) + ai_locked | `telemetry_stream` | 3 | +| AZ-677 | telemetry_stream MapObjects snapshot + diffs + reconnect resync | `telemetry_stream` | 3 | +| AZ-678 | operator_bridge command authentication (HMAC, replay, session) | `operator_bridge` | 5 | +| AZ-679 | operator_bridge POI surface mapper + dequeue + deadline carriage | `operator_bridge` | 3 | + +Batch chosen explicitly for **Telemetry+Operator foundation cohesion** — all four tickets sit on top of AZ-675 (gRPC server, shipped in batch 14) and AZ-667 (mapobjects_store hydrate, prior). AZ-676 closes the video transport question for the operator side; AZ-677 closes the MapObjects-bundle transport pattern; AZ-678 lays down the authentication invariants every command will cross; AZ-679 produces the wire-format POI events the GS UI consumes. Subsequent operator-side work (AZ-680 dispatch, AZ-681 safety/BIT ACK, AZ-684 VLM label) plugs into these four contracts. + +`AZ-658` (frame_ingest decoder, 5 pts) and `AZ-668` (scan_controller queue) remained unblocked but were deliberately deferred: AZ-658 has an open H.264-binding decision the team hasn't committed to (retina vs ffmpeg-rs vs gstreamer; cf. cumulative C7-adjacent risk), and AZ-668 is better picked up as part of the next scan_controller batch where its consumer surface lands. + +## 2. Approach + +### AZ-676 — Video path + +Two delivery modes named in the task spec map to a `VideoPath` enum (`RtspForward { url }` / `BytesInline { … }`) on the runtime, and to a single SubscribeVideo RPC on the wire. The session-start contract was promoted into its own proto message (`VideoSessionStart`) so the client can branch on `oneof` without re-reading config. + +**ai_locked coordination** is a single `Arc` owned by the `VideoPublisher`; session register / deregister flips it under a counter so concurrent subscribers don't toggle it back and forth. Consumers (`frame_ingest` AZ-657 already done; `detection_client` AZ-660) read the flag via `TelemetryStreamHandle::ai_locked_handle()` — no cross-crate observer registration, just a shared atomic. + +The `bytes_inline` path uses the same `tokio::sync::broadcast` machinery as the telemetry topics (lossy ring buffer, per-client drop counters). The `rtsp_forward` path is a no-op for `push_frame` — `frame_ingest` keeps calling without branching on configuration, the publisher decides. + +### AZ-677 — MapObjects snapshot + diff + +The contract added is `MapObjectsSnapshotSource` (a trait `telemetry_stream` calls into; the production implementation will be `mapobjects_store::Store` via a thin adapter — not yet wired, lives in `EmptyMapObjectsSource` fixture for now). The wire format is a tagged enum `MapObjectsTopicMessage::{ Snapshot, Diff }` so the operator UI can branch deterministically. + +**Snapshot-on-subscribe** is implemented via a `StartThen` stream combinator inside the gRPC `subscribe` handler: when the requested topic list includes `MapObjectsBundle`, we synchronously call `current_snapshot_message()` and prepend it to the broadcast stream. **Reconnect** therefore Just Works — a new subscribe is a new snapshot, no replay state to manage. + +**Diff fan-out** uses the existing publisher: `TelemetryStreamHandle::push_mapobjects_diff(diff)` serialises and publishes on `Topic::MapObjectsBundle`. The wire enum tag (`kind: snapshot | diff`) keeps both message types on the same topic. + +### AZ-678 — Command authentication + +The contract `OperatorCommandValidator` + types (`SignedCommand`, `ValidatedCommand`, `AuthError`) lives in `shared::contracts::operator_auth` so dispatch callsites (`scan_controller`, `mission_executor`) can depend on the trait without importing `operator_bridge` — a layering invariant the architecture deliberately preserves. + +The default implementation `HmacOperatorValidator` (`operator_bridge::internal::auth`) is intentionally narrow: + +- HMAC-SHA256 over `(session_token || '|' || seq_be || '|' || canonical_payload_json)`. The separator byte prevents length-extension between the three fields; canonical JSON is `serde_json::to_vec` of the `serde_json::Value` (deterministic for the operator's signing side). +- Constant-time compare via `hmac::Mac::verify_slice` (no timing oracle, per NFR-Security). +- Per-session replay tracker — `last_seen_seq: Option` advances on Ok, never on rejection. Rejecting `seq=N` does not poison the session: a legitimate retry can still land with `N+1`. This was the subtlety that drove the explicit AC-2 + AC-3 tests. +- Session registry is in-process `HashMap` keyed by an opaque token. `register_session(token, secret)` is called from the (out-of-scope) Ground Station handshake; revoke + TTL (default 30 min) are first-class. +- Rejection counters under a fixed-shape `AuthCounters` array (one slot per `REJECTION_REASONS`), exposed to the health surface. +- **Health-red gate**: sliding-window VecDeque of signature-failure timestamps over the trailing 60 s; once ≥ `signature_failure_red_threshold` (default 30/min) the health surface goes red. Pruning is amortised O(1) on every record + every health probe. + +### AZ-679 — POI surface + +The wire shape is the canonical model `shared::models::operator_event::OperatorPoiEvent` (matches `architecture.md §7.10`). `PoiSurfaceMapper::map(&poi, photo_metadata)` is a pure transform; `surface(&poi, photo_metadata)` is map + push through the `TelemetrySink::push_operator_event` extension. `emit_dequeued(poi_id, reason)` produces a `PoiDequeued` event. Both flow over a new `Topic::OperatorEvent` channel; the wire payload is a tagged enum (`OperatorEvent::{ PoiSurfaced, PoiDequeued }` with serde tag `kind`). + +`vlm_label` is intentionally `None` for now — the `Poi` model carries `vlm_status` (the pipeline status) but not the assistant-label string. The label will be threaded through in AZ-684 when scan_controller's VLM assessment ladder lands; the wire field is already in place so the operator UI can render it without a future schema change. + +`PoiSurfaceMetrics` exposes `pois_surfaced_per_min` (sliding 60 s window) + cumulative totals. Health is green by default; goes red only when the validator's signature-failure window crosses threshold (AC-5 via AZ-678). + +### Cross-crate wiring + +- `TelemetrySink` (in `shared::contracts`) gained `push_operator_event(OperatorEvent) -> Result<()>`. Only `telemetry_stream::TelemetryStreamHandle` implements `TelemetrySink`; production code already constructs the handle in the composition root, so the new method is wired automatically once batch 15 lands. +- `OperatorBridge` got two optional builder methods, `with_telemetry_sink(Arc)` and `with_validator(Arc)`. Existing call sites (tests, partial scaffolding in autopilot/runtime.rs) keep compiling. The composition-root wiring (autopilot/runtime.rs) is left for AZ-680 since dispatch + sink + validator are most naturally bundled. + +## 3. Files touched + +### Production + +- `Cargo.toml` — `hmac = "0.12"` workspace dep. +- `crates/shared/src/models/operator_event.rs` — **new**. `Tier2EvidenceSummary`, `PhotoMetadata`, `OperatorPoiEvent`, `DequeueReason`, `PoiDequeued`, `OperatorEvent`. +- `crates/shared/src/models/mod.rs` — `pub mod operator_event;`. +- `crates/shared/src/contracts/operator_auth.rs` — **new**. `SignedCommand`, `ValidatedCommand`, `AuthError`, `OperatorCommandValidator` trait. +- `crates/shared/src/contracts/mod.rs` — `pub mod operator_auth;` + `TelemetrySink::push_operator_event`. +- `crates/telemetry_stream/Cargo.toml` — `bytes` dep. +- `crates/telemetry_stream/proto/telemetry.proto` — `Topic::OperatorEvent`; `SubscribeVideo` RPC + supporting messages. +- `crates/telemetry_stream/src/internal/mod.rs` — `pub mod {mapobjects, video, video_server};`. +- `crates/telemetry_stream/src/internal/mapobjects.rs` — **new**. Snapshot + diff types, `MapObjectsSnapshotSource` trait, `EmptyMapObjectsSource` fixture. +- `crates/telemetry_stream/src/internal/video.rs` — **new**. `VideoPath`, `VideoFrameMessage`, `VideoSnapshot`, `VideoPublisher` (with ai_locked atomic + session counter). +- `crates/telemetry_stream/src/internal/video_server.rs` — **new**. SubscribeVideo RPC handler. +- `crates/telemetry_stream/src/internal/publisher.rs` — `OperatorEvent` topic added to `ALL_TOPICS`; snapshot/diff source + counters wired. +- `crates/telemetry_stream/src/internal/server.rs` — gRPC `subscribe_video` delegate; `subscribe` snapshot-prepend on `MapObjectsBundle`. +- `crates/telemetry_stream/src/lib.rs` — `TelemetryStreamConfig` video knobs; `VideoPublisher` construction; `ai_locked_handle`; `set_mapobjects_snapshot_source`; `push_mapobjects_diff`; `video_snapshot`; `TelemetrySink::push_frame` + `push_operator_event` impls. +- `crates/operator_bridge/Cargo.toml` — `serde_json`, `parking_lot`, `chrono`, `uuid`, `hmac`, `sha2`, `thiserror`. +- `crates/operator_bridge/src/internal/mod.rs` — `pub mod {auth, poi_surface};`. +- `crates/operator_bridge/src/internal/auth.rs` — **new**. `HmacValidatorConfig`, `HmacOperatorValidator`, `AuthCounters`, `REJECTION_REASONS`, session registry, replay tracker, health-red sliding window. +- `crates/operator_bridge/src/internal/poi_surface.rs` — **new**. `PoiSurfaceMapper`, `PoiSurfaceMetrics`, `SurfaceRateWindow`. +- `crates/operator_bridge/src/lib.rs` — `with_telemetry_sink`, `with_validator`, `surface_poi`, `surface_poi_with_photo`, `emit_poi_dequeued`, `poi_metrics`, updated `health()`. + +### Tests + +- `crates/telemetry_stream/tests/video_path.rs` — **new**. 4 integration tests (AC-1, AC-2, AC-3, empty-client guard). +- `crates/telemetry_stream/tests/mapobjects_snapshot.rs` — **new**. 3 integration tests (AC-1, AC-2, AC-3). + +### Process + +- `_docs/02_tasks/done/AZ-676_telemetry_stream_video_path.md` — moved from `todo/`. +- `_docs/02_tasks/done/AZ-677_telemetry_stream_mapobjects_snapshot.md` — moved from `todo/`. +- `_docs/02_tasks/done/AZ-678_operator_bridge_command_auth.md` — moved from `todo/`. +- `_docs/02_tasks/done/AZ-679_operator_bridge_poi_surface.md` — moved from `todo/`. +- `_docs/_autodev_state.md` — phase update. +- `_docs/03_implementation/batch_15_cycle1_report.md` — this report. +- `_docs/03_implementation/cumulative_review_batches_13-15_cycle1_report.md` — cumulative review (separate file). + +## 4. Test results + +| Crate | Unit | Integration | Total | +|---|---|---|---| +| `shared` | 9 (+2 new for operator_event serde) | — | 9 | +| `telemetry_stream` | 18 (+6 new for video + 3 new for mapobjects) | 12 (+4 video_path, +3 mapobjects_snapshot) | 30 | +| `operator_bridge` | 11 (5 auth AC + 1 smoke + 3 poi_surface AC + 2 bridge wiring) | — | 11 | + +`cargo clippy -p shared -p telemetry_stream -p operator_bridge --all-targets -- -D warnings`: clean after the test-time `assert_eq!(.., false)` → `assert!(!..)` rewrite. + +`cargo fmt -p shared -p telemetry_stream -p operator_bridge`: no diff. + +Workspace `cargo test --workspace`: all suites green **except** the carried-over `mission_executor::state_machine::ac3_bounded_retry_then_success` flake (see C6 — unchanged by this batch). + +### Acceptance criteria + +| Ticket | AC | Test | Status | +|---|---|---|---| +| AZ-676 | AC-1 rtsp_forward URL only | `tests/video_path.rs::ac1_rtsp_forward_emits_url_only` | ✅ | +| AZ-676 | AC-2 bytes_inline forwards frames | `tests/video_path.rs::ac2_bytes_inline_forwards_frames` + `internal/video.rs::bytes_inline_publish_frame_counts_and_fans_out` | ✅ | +| AZ-676 | AC-3 ai_locked toggles on session start/stop | `tests/video_path.rs::ac3_ai_locked_toggles_on_session_start_and_stop` + `internal/video.rs::register_first_session_flips_ai_locked_true` + `deregister_last_session_flips_ai_locked_false` | ✅ | +| AZ-677 | AC-1 first subscribe → snapshot | `tests/mapobjects_snapshot.rs::ac1_first_subscribe_receives_snapshot` | ✅ | +| AZ-677 | AC-2 in-flight diffs | `tests/mapobjects_snapshot.rs::ac2_inflight_changes_emit_diffs` | ✅ | +| AZ-677 | AC-3 reconnect re-snapshots | `tests/mapobjects_snapshot.rs::ac3_reconnect_resnaps_without_replay` | ✅ | +| AZ-678 | AC-1 valid signed command passes | `internal/auth.rs::ac1_valid_signed_command_passes` | ✅ | +| AZ-678 | AC-2 invalid signature rejected, seq not advanced | `internal/auth.rs::ac2_invalid_signature_rejected_and_seq_not_advanced` | ✅ | +| AZ-678 | AC-3 replay detected | `internal/auth.rs::ac3_replay_detected` | ✅ | +| AZ-678 | AC-4 unknown/expired session rejected | `internal/auth.rs::ac4_unknown_or_expired_session_rejected` | ✅ | +| AZ-678 | AC-5 sustained sig failures → health red | `internal/auth.rs::ac5_sustained_signature_failures_flip_health_red` | ✅ | +| AZ-679 | AC-1 all required fields populated | `internal/poi_surface.rs::ac1_full_poi_maps_all_required_fields` | ✅ | +| AZ-679 | AC-2 VLM-disabled carries explicit status | `internal/poi_surface.rs::ac2_vlm_disabled_carries_explicit_status` | ✅ | +| AZ-679 | AC-3 dequeue emits event through sink | `internal/poi_surface.rs::ac3_dequeue_emits_event_through_sink` | ✅ | + +## 5. Code-review findings (this batch) + +**Verdict**: PASS_WITH_WARNINGS — zero Critical, zero High; one Medium and three Low. + +| # | Severity | Category | File:Line | Title | +|---|---|---|---|---| +| F1 | Medium | Maintainability | `crates/operator_bridge/src/internal/auth.rs:191-198` | `serde_json::to_vec(payload).unwrap_or_default()` silently substitutes empty bytes on a serialisation failure | +| F2 | Low | Spec-Gap | `crates/operator_bridge/src/internal/poi_surface.rs:103-111` | `vlm_label` is hard-coded `None`; AC-1 wording allows this for AZ-684 follow-up but the wire field is exposed without producer for now | +| F3 | Low | Architecture / Doc-sync | `crates/telemetry_stream/proto/telemetry.proto` + `_docs/02_document/architecture.md §7.x` | New proto topics + RPC (Topic::OperatorEvent, SubscribeVideo) not yet reflected in the architecture doc surface table — doc sweep ticket needed | +| F4 | Low | Scope | `crates/operator_bridge/src/lib.rs:120-128` | `surface_poi` returns `NotImplemented` after pushing the surface event — convenient placeholder for AZ-680 but caller could mistake the side-effect for a successful round-trip | + +### Finding details + +**F1: silent fallback on signing-payload serialisation** (Medium / Maintainability) + +- Location: `crates/operator_bridge/src/internal/auth.rs:191-198`. +- Description: `signing_material` calls `serde_json::to_vec(payload).unwrap_or_default()`. A `serde_json::Value` cannot in practice fail to serialise (no foreign types in `Value`), so the failure path is unreachable today. But the silent `unwrap_or_default()` would produce a signing string with **empty** payload bytes on a hypothetical failure — which would then HMAC-verify against a sign-side that also failed identically, masking the issue. +- Suggestion: replace with `.expect("serde_json::Value always serialises")` so the failure mode is loud, OR return `Err(AuthError::SignatureInvalid)` (treating the failure as un-verifiable input). Either is consistent with the project rule "never suppress errors silently". +- Task: AZ-678. + +**F2: vlm_label producer deferred** (Low / Spec-Gap) + +- Location: `crates/operator_bridge/src/internal/poi_surface.rs:103-111`. +- Description: AZ-679 AC-1 says the wire event has every required field populated; the architecture §7.10 schema lists `vlm_label` as optional. The mapper produces `None` for every status, including `VlmPipelineStatus::Ok` where the label *should* be present. The `Poi` model does not carry the label string (it only has the pipeline status), so this is a producer-side gap, not a transport gap. +- Suggestion: add an explicit comment that AZ-684 (scan_controller VLM ladder) is the producer, and at that point introduce either a richer `Poi::vlm_label: Option` field or a richer overload on `PoiSurfaceMapper::map_with_label(poi, label)`. Currently the comment in the code is accurate but the gap is worth tracking until AZ-684 lands. +- Task: AZ-679. + +**F3: architecture doc surface table out of sync with new proto topics** (Low / Architecture) + +- Location: `crates/telemetry_stream/proto/telemetry.proto` (now defines `Topic::OperatorEvent` + `SubscribeVideo` RPC). +- Description: `architecture.md §7.x` enumerates the telemetry topic catalogue and the operator-link RPC surface. Batches 14 + 15 together have added: gRPC server, video subscribe, MapObjects snapshot-on-subscribe, operator events. The architecture doc has not yet had the surface table refreshed. +- Suggestion: schedule a doc-sync sweep that covers batches 13-15 (architecture topic table + decision-rationale entries for Tonic-gRPC = closed Q2, and a brief note on the snapshot-then-diff pattern for MapObjects). Fold into the next monorepo-document/architecture-sync ticket. +- Task: batches 13-15 collectively (carried as C3 + C7). + +**F4: surface_poi placeholder returns NotImplemented after side-effect** (Low / Scope) + +- Location: `crates/operator_bridge/src/lib.rs:120-128`. +- Description: `OperatorBridgeHandle::surface_poi` pushes the surface event through the sink and then returns `Err(NotImplemented(AZ-680))`. The intent is "the surface IS pushed; the decision round-trip is AZ-680". A caller who tries to retry on error would double-push. +- Suggestion: when AZ-680 lands, replace with a real decision channel. Until then, document explicitly that callers should treat `NotImplemented` here as "fire-and-forget, decision pending" — or rename to `enqueue_surface_only_pending_decision_loop` to make the placeholder posture unambiguous. +- Task: AZ-679 (placeholder), AZ-680 (real fix). + +## 6. Open cumulative findings touched + +- **C5 (autopilot dead-code clippy)** — unchanged; still blocks `--all-targets -D warnings` at the workspace level. Not fixable inside batch 15 scope. +- **C6 (mission_executor ac3 flake)** — unchanged; reproduced once during the workspace test run, passes when re-run targeted (`-p mission_executor --test state_machine ac3_bounded_retry_then_success`). Documented in `_docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md`. + +## 7. Cumulative review trigger + +End of triplet 13 / 14 / 15 — cumulative review for these three batches is produced as `_docs/03_implementation/cumulative_review_batches_13-15_cycle1_report.md`. + +## 8. Next-batch candidates + +- **AZ-680** — operator command dispatch (the consumer of AZ-678's `ValidatedCommand`). Naturally bundles with composition-root wiring (autopilot/runtime.rs) of `OperatorBridge::with_validator` + `with_telemetry_sink`. +- **AZ-668** — scan_controller POI queue. Becomes much more tractable now that the wire format (AZ-679) is fixed. +- **AZ-684** — scan_controller VLM assessment ladder; resolves F2 above. +- **AZ-658** — frame_ingest decoder. Still needs the H.264-binding decision. +- Doc sweep covering batches 13-15 (architecture topic table, Tonic-gRPC decision, snapshot-then-diff pattern). diff --git a/_docs/03_implementation/cumulative_review_batches_13-15_cycle1_report.md b/_docs/03_implementation/cumulative_review_batches_13-15_cycle1_report.md new file mode 100644 index 0000000..adf4bbc --- /dev/null +++ b/_docs/03_implementation/cumulative_review_batches_13-15_cycle1_report.md @@ -0,0 +1,227 @@ +# Cumulative Code Review — Batches 13–15 (Cycle 1) + +**Trigger**: `implement/SKILL.md` Step 14.5 — `K=3` batches completed since the last cumulative review (`cumulative_review_batches_07-09_cycle1_report.md`). Note: triplet 10–12 was skipped at the time and remains an outstanding gap on the cumulative cadence; surfaced here for visibility but not retro-scored. +**Date**: 2026-05-20 +**Cycle**: 1 +**Scope**: union of files changed in `batch_13_cycle1`, `batch_14_cycle1`, `batch_15_cycle1` (since the close of `batch_12_cycle1`). +**Mode**: inline (matching the per-batch precedent). +**Baseline**: `_docs/02_document/architecture_compliance_baseline.md` still does not exist. No `## Baseline Delta` section is produced. The intent recorded in cumulative reviews 04–06 and 07–09 to promote a baseline remains carried forward. + +## Tasks in scope + +| Batch | Tasks | Components touched | +|-------|-------|--------------------| +| 13 | AZ-683 (`scan_controller_poi_queue_and_window`) | `scan_controller` | +| 14 | AZ-675 (`telemetry_stream_grpc_server`) | `telemetry_stream`, workspace tonic/prost stack | +| 15 | AZ-676 (`telemetry_stream_video_path`), AZ-677 (`telemetry_stream_mapobjects_snapshot`), AZ-678 (`operator_bridge_command_auth`), AZ-679 (`operator_bridge_poi_surface`) | `telemetry_stream`, `operator_bridge`, `shared` | + +**Total AC verification (rolled up)**: **6 (batch 13) + 5 (batch 14) + 14 (batch 15) = 25 / 25** ACs verified locally with tests; no unverified spec gap. + +**Code volume** (approximate, source + tests, excluding `_docs/` and `Cargo.lock`): + +- Batch 13: ~1,100 LOC added (scan_controller POI queue + priority module + 6 integration + 13 unit tests). +- Batch 14: ~1,400 LOC added (telemetry_stream tonic infrastructure + publisher + server + 5 integration + 6 unit tests; first-time workspace tonic/prost/protoc pins). +- Batch 15: ~1,950 LOC added (telemetry_stream video + mapobjects modules + operator_bridge auth + poi_surface modules + 11 + 18 unit + 12 integration tests + 2 new shared modules). + +## Phase 1 — Spec coverage + +Every Included scope item across these three batches lands in production code: + +- **AZ-683 (Batch 13)**: production POI queue with proximity/age-weighted priority math, rolling 60 s × 5/min cap, confidence floor, decision-window mapping, timeout sweep, `DeclinePoi` operator-command end-to-end → `DeclineAction` for AZ-685. +- **AZ-675 (Batch 14)**: production Tonic gRPC server (`TelemetryStream::Subscribe`), per-(client, topic) broadcast queue, drop-counter back-pressure, RAII shutdown, `TelemetrySink::push_detections` real impl. Closes architecture Q2 in favour of gRPC server-streaming. +- **AZ-676 (Batch 15)**: production `VideoPublisher` with rtsp_forward + bytes_inline modes, ai_locked atomic + session counter, SubscribeVideo RPC. +- **AZ-677 (Batch 15)**: production snapshot-on-subscribe stream-prepend + diff broadcast on `Topic::MapObjectsBundle`; `MapObjectsSnapshotSource` trait + `EmptyMapObjectsSource` fixture pending the real `mapobjects_store` adapter. +- **AZ-678 (Batch 15)**: production `HmacOperatorValidator` with HMAC-SHA256, per-session monotonic seq tracker, in-process session registry with TTL, rejection-reason counters, sliding 60 s sig-failure window → red-health gate. Trait `OperatorCommandValidator` in `shared::contracts` so dispatch can depend on the contract without importing `operator_bridge`. +- **AZ-679 (Batch 15)**: production `PoiSurfaceMapper` producing `OperatorPoiEvent` per `architecture.md §7.10`, `PoiDequeued` events on rotation/age-out/completion, pushed via the new `TelemetrySink::push_operator_event` extension. + +**Contract verification**: +- `shared::contracts::operator_auth::{SignedCommand, ValidatedCommand, AuthError, OperatorCommandValidator}` — trait shape matches the AZ-678 task `Contract` section verbatim. +- `shared::models::operator_event::{OperatorPoiEvent, PoiDequeued, OperatorEvent}` — fields match `architecture.md §7.10` and the AZ-679 task spec's field list. One **known gap**: `vlm_label` is wired in the wire shape but the producer is deferred to AZ-684 (`scan_controller` VLM ladder); the `Poi` model does not carry the label string today. Surfaced as a Low finding rather than a High Spec-Gap because the wire is in place and the producer is a separately scheduled ticket. + +PASS. + +## Phase 2 — Code quality + +| Concern | Finding | Severity | +|---------|---------|----------| +| `serde_json::to_vec(payload).unwrap_or_default()` in `HmacOperatorValidator::signing_material` | Silent fallback to empty bytes on a hypothetical serde failure produces a signing string that the sign-side would also produce on the same failure, masking the issue. Project rule "never suppress errors silently" applies even when the failure is unreachable today. | Medium / Maintainability | +| Optional builder pattern on `OperatorBridge` (`with_telemetry_sink`, `with_validator`) | Both surfaces compile and run without the sink/validator wired, returning `NotImplemented`. Used as the bridge between the AZ-678/679 landing and the AZ-680 composition-root wiring. Acceptable as a temporary shape; should be reduced once AZ-680 fully wires the runtime. | Low / Scope | +| `surface_poi` returns `NotImplemented` after pushing the side-effect | A caller doing naive retry-on-error would double-publish. The intent ("surface pushed; decision loop is AZ-680") is comment-only. | Low / Scope | +| `vlm_label` always `None` in `PoiSurfaceMapper::map` | The `Poi` model doesn't carry the label; AZ-684 will produce it. Wire field is correct; producer wiring is the gap. | Low / Spec-Gap | +| `VideoSnapshot.mode_label` string vs proto `VideoMode` enum | Both exist in parallel and serve different consumers (health surface vs proto). Acceptable; documented in `internal/video.rs` and tested for parity in `mode_label_matches_task_spec_strings`. | — | +| `unsafe` blocks | None added across all three batches. | — | +| Production `unwrap` / `expect` | All hits are in `#[cfg(test)]` modules, `serde_json::to_string`/`from_str` round-trips, or `HMAC::new_from_slice` which is documented infallible for any key length. No production crash sites. | — | +| Test back-door discipline | No new `#[doc(hidden)]` or `*_for_tests` surfaces this triplet beyond the batch 9 ones already documented. | — | + +## Phase 3 — Security quick-scan + +- HMAC compare uses `hmac::Mac::verify_slice` (constant-time). Verified per AZ-678 NFR-Security. +- No SQL / shell-string interpolation. +- Rejection logging uses `command_id` only, never the raw payload. Per AZ-678 NFR-Security: "reject-then-log; never log the raw payload of a rejected command at info level". +- Session secrets stored in-process only; no leak to logs or telemetry. +- No new external input deserialization. The `MapObjectsTopicMessage` and `OperatorEvent` round-trips are over `serde_json` of canonical Rust types; no untrusted-source deserialization path. +- gRPC server binds to an explicit config-driven `listen_addr` (no implicit binding to 0.0.0.0 unless configured). +- Note: the wire payload for `VideoFrame.bytes` is opaque to `telemetry_stream` — the producer (`frame_ingest`) owns the codec semantics. No new attack surface at the gRPC boundary. + +PASS. + +## Phase 4 — Performance scan + +- **Broadcast fan-out**: `tokio::sync::broadcast` with per-topic ring buffers (default `topic_capacity = 256`). Slow-subscriber drop is detected via `BroadcastStreamRecvError::Lagged(n)` and accounted in per-(client, topic) counters. Verified by `slow_subscriber_lags_fast_subscriber_does_not` (unit) and `ac2_slow_subscriber_drops_oldest_healthy_unaffected` (integration). +- **HMAC validate**: O(payload_size) HMAC compute + constant-time compare. Per AZ-678 NFR ≤1 ms p99 budget; the SHA-256 compute cost on a Jetson-class device for typical 64–256 byte payloads is well under that. +- **Session registry lookup**: `HashMap` — O(1) amortised. TTL check is O(1) per validate. +- **Sliding 60 s signature-failure window**: `VecDeque`. Push + opportunistic prune is amortised O(1). The prune happens at every push and at every `health_is_red` call, so memory is bounded by `min(threshold × 2, 60 s of attempt traffic)`. +- **POI surface mapping**: `PoiSurfaceMapper::map` is a pure struct-to-struct copy plus an `Option::clone` of the Tier-2 evidence summary. Sub-millisecond by inspection; matches AZ-679 NFR ≤1 ms p99. +- **MapObjects snapshot serialisation**: `serde_json::to_vec` over the canonical bundle. Per AZ-677 NFR ≤200 ms p99 for ≤10 000 entries. Not benchmarked in this triplet; the `EmptyMapObjectsSource` fixture used in tests does not exercise that volume. **Open for next benchmark cycle**: add a `mapobjects_snapshot_serialise_10k_under_200ms` perf test once the real `mapobjects_store` adapter is wired. + +PASS (with the snapshot perf-test as a noted follow-up, not a blocker). + +## Phase 5 — Cross-task consistency + +**Telemetry transport pattern (the load-bearing consistency check for this triplet)** — three independent topic categories now flow through the same `TelemetryPublisher`: + +| Topic | Pattern | Snapshot? | Wire shape | +|-------|---------|-----------|------------| +| `TelemetrySample` / `GimbalState` / `DetectionEvent` / `MovementCandidate` | Pure broadcast | No | JSON of canonical Rust model | +| `MapObjectsBundle` | Snapshot-on-subscribe + broadcast diff | Yes (`MapObjectsBundleSnapshot`) | Tagged enum `MapObjectsTopicMessage { Snapshot, Diff }` | +| `OperatorEvent` | Pure broadcast (new in batch 15) | No (events are inherently incremental) | Tagged enum `OperatorEvent { PoiSurfaced, PoiDequeued }` | + +Pattern convergence is intentional: every topic that needs to carry "structurally distinct kinds of message" uses a `serde(tag = "kind")` tagged enum; every topic that carries a single message type uses the bare model. This keeps the operator UI's deserialisation cheap and makes the topic catalogue easy to extend. + +**Service expansion**: `TelemetryStream` proto grew from one RPC (`Subscribe`) in batch 14 to two RPCs (`Subscribe` + `SubscribeVideo`) in batch 15. The split is right — video has its own framing semantics (`oneof { session_start, frame }`) that don't belong in the generic `payload_json`-carrying telemetry channel. The two RPCs share zero implementation by design. + +**Operator-side trait surface**: `OperatorCommandValidator` (auth, in `shared::contracts`) and `TelemetrySink::push_operator_event` (events, in `shared::contracts`) form the two halves of the operator boundary. The `Poi` → `OperatorPoiEvent` mapping owns the producer side; AZ-680 will own the dispatch side. Both halves cross the boundary through `shared::contracts`, so neither side imports the other directly. + +**Naming**: +- `OperatorEvent` (the tagged enum) vs `OperatorCommand` (already in `shared::models::operator`) — clear directional split (events flow drone → GS, commands flow GS → drone). No collision. +- `MapObjectsDiff` (new in `telemetry_stream::internal::mapobjects`) vs `mission_client::MapObjectsDiff` (existing) — **different domains**: the transport-side diff (what `telemetry_stream` broadcasts to operator clients) vs the persistence-side diff (what `mission_client` pushes post-flight to the platform). Both are short snapshots of "what changed in the store"; the producers are disjoint and the consumers are disjoint, so the type collision is harmless. **Surfaced as a Low finding** for future cleanup: a shared `shared::models::mapobjects::Diff` would dedupe. + +PASS (one new Low finding). + +## Phase 6 — Architecture compliance + +**Layer direction** (per `_docs/02_document/module-layout.md`): + +- `scan_controller` (Layer 3, Coordinator) — adds `serde_json` + `chrono` deps; imports from `shared`, `mission_client`, `mapobjects_store`. No Layer 3 → Layer 3 import. +- `telemetry_stream` (Layer 2, Transport) — imports from `shared` only. The new `bytes` workspace dep is a Layer 1 utility. No upward import. +- `operator_bridge` (Layer 2, Transport) — imports from `shared` only. **Does not** import from `telemetry_stream` — instead depends on the `TelemetrySink` trait in `shared::contracts`, which `telemetry_stream::TelemetryStreamHandle` implements. This is the boundary that keeps the operator boundary cleanly testable (the `RecordingSink` in `poi_surface.rs` tests is a `TelemetrySink` impl with no transport). +- `shared` — added two new modules (`models::operator_event`, `contracts::operator_auth`) and one trait method (`TelemetrySink::push_operator_event`). No upward imports. + +PASS. + +**Public API respect**: +- `shared::contracts::operator_auth::{SignedCommand, ValidatedCommand, AuthError, OperatorCommandValidator}` — all in Public API. +- `shared::models::operator_event::{OperatorEvent, OperatorPoiEvent, PoiDequeued, DequeueReason, PhotoMetadata, Tier2EvidenceSummary}` — all in Public API. +- `telemetry_stream::{video_message, MapObjectsDiff, MapObjectsBundleSnapshot, MapObjectsTopicMessage, MapObjectsSnapshotSource, EmptyMapObjectsSource, VideoPath, VideoSnapshot}` — all re-exported from the crate root for cross-component consumption. +- `operator_bridge::{HmacOperatorValidator, HmacValidatorConfig, AuthCounters, REJECTION_REASONS, PoiSurfaceMapper, PoiSurfaceMetrics}` — all in Public API. + +No internal-file imports across components. + +PASS. + +**Cyclic dependencies**: built the import graph over the changed files plus direct deps. + +- `shared` ← `telemetry_stream`, `operator_bridge`, `scan_controller`, … (no cycles; shared is the root). +- `telemetry_stream` and `operator_bridge` share no direct dependency in either direction. +- The runtime composition root (`autopilot/runtime.rs`) will wire `telemetry_stream::TelemetryStreamHandle` (as `Arc`) into `OperatorBridge::with_telemetry_sink`. That wiring lives in the composition root, not in either component — no cyclic dep introduced. + +PASS. + +**Duplicate symbols across components**: +- `MapObjectsDiff` collision noted in Phase 5 (Low / Maintainability finding for future consolidation). +- `Poi` (shared model) vs `OperatorPoiEvent` (wire model in `shared::models::operator_event`) — intentional split; the wire model is a subset projection. No collision. +- `SessionEntry`, `HmacSha256` are private to `operator_bridge::internal::auth`. No cross-component leakage. + +PASS (one Low finding for the diff name collision). + +**Cross-cutting concerns**: `tracing` is the only cross-cutting concern touched. Used consistently (`warn!` for rejections in auth; the rest of the triplet adds no new logging). No bespoke logging setup. + +PASS. + +**Module-layout drift** (carried from cumulative 07–09 + extended this triplet): +- `telemetry_stream/src/internal/{publisher,server,proto,video,video_server,mapobjects}.rs` — `module-layout.md` predates batches 14 + 15; the actual file layout is now denser than the doc lists. +- `operator_bridge/src/internal/{auth,poi_surface}.rs` — newly added; `module-layout.md` listed only `operator_bridge/src/lib.rs` before. +- Carried as Low / Architecture (doc-sync) finding; not a code issue. + +## Phase 7 — Architecture compliance (baseline delta) + +Skipped — no `architecture_compliance_baseline.md` exists yet. Recommendation to promote one once the operator-side composition root (AZ-680) lands and the public API surface is more stable. + +## Findings (cumulative for batches 13–15) + +| # | Severity | Category | File:Line | Title | +|---|----------|----------|-----------|-------| +| 1 | Medium | Maintainability | `crates/operator_bridge/src/internal/auth.rs:191-198` | Silent `unwrap_or_default()` in `signing_material` (carry from batch 15 F1) | +| 2 | Low | Maintainability | `crates/telemetry_stream/src/internal/mapobjects.rs` + `crates/mission_client/src/lib.rs` | `MapObjectsDiff` name collision across two unrelated domains (transport vs persistence) | +| 3 | Low | Spec-Gap | `crates/operator_bridge/src/internal/poi_surface.rs:103-111` | `vlm_label` producer deferred to AZ-684 (carry from batch 15 F2) | +| 4 | Low | Architecture | `_docs/02_document/architecture.md §7.x` + `_docs/02_document/module-layout.md` | Architecture doc topic table + module-layout paths drift across batches 13–15 | +| 5 | Low | Scope | `crates/operator_bridge/src/lib.rs:120-128` | `surface_poi` returns `NotImplemented` after side-effect (placeholder for AZ-680) | + +### Finding details + +**F1 (cumulative): silent fallback on signing-payload serialisation** (Medium / Maintainability) +- Carried unchanged from batch 15 F1. +- Suggestion (cumulative): replace with `.expect("serde_json::Value always serialises")` so the failure mode is loud. Single-line fix; folded into AZ-680 or a tiny refactor task at next pass. + +**F2 (cumulative-new): `MapObjectsDiff` name collision** (Low / Maintainability) +- Location: `crates/telemetry_stream/src/internal/mapobjects.rs` defines `MapObjectsDiff`; `crates/mission_client/src/lib.rs` also defines `MapObjectsDiff`. +- Description: the two types live in different domains (operator-link broadcast vs post-flight persistence push) and have different shapes. Both are correct in their own crate; the name collision is benign today but creates ambiguity when grepping or in IDE auto-imports. +- Suggestion: extract a shared `shared::models::mapobjects::Diff` (or two clearly-named variants — `LiveDiff` vs `PersistDiff`) and have both crates consume it. Defer to a focused dedupe task; not blocking. +- Tasks: AZ-677 + (existing) AZ-668 / AZ-685. + +**F3 (cumulative): `vlm_label` producer deferred** (Low / Spec-Gap) +- Carried unchanged from batch 15 F2. +- Resolved by AZ-684. + +**F4 (cumulative): doc surface table drift** (Low / Architecture) +- The Tonic gRPC infrastructure (batch 14), the video + mapobjects topics + RPCs (batch 15), the operator authentication trait + HMAC default (batch 15), and the POI surface wire format (batch 15) all need to be reflected in `_docs/02_document/architecture.md §7.x` (topic catalogue, RPC catalogue) and `_docs/02_document/module-layout.md` (per-component file list + public-API list). +- Suggestion: schedule a doc sweep covering batches 13–15 that updates: + - `architecture.md §7.x` — topic catalogue + RPC catalogue. + - `decision-rationale.md` — Q2 (operator-link protocol = Tonic gRPC), and a note on the snapshot-then-diff pattern for `MapObjectsBundle`. + - `module-layout.md` — `telemetry_stream/src/internal/{video, video_server, mapobjects}.rs`, `operator_bridge/src/internal/{auth, poi_surface}.rs`. +- Tasks: batches 13–15 collectively. + +**F5 (cumulative): `surface_poi` placeholder** (Low / Scope) +- Carried unchanged from batch 15 F4. +- Resolved by AZ-680. + +## Verdict + +**PASS_WITH_WARNINGS** — 0 Critical, 0 High, 1 Medium, 4 Low. + +Per the implement skill's auto-fix matrix: +- F1 (Medium / Maintainability) → **auto-fix eligible**, single-line change. Recommendation: fold into AZ-680 or a tiny clean-up at next batch. +- F2 (Low / Maintainability, cross-crate shared-type extraction) → **schedule as a focused refactor** rather than auto-fix; touches two component public surfaces. +- F3 (Low / Spec-Gap, deferred producer) → **wait for AZ-684**. +- F4 (Low / Architecture, doc-only) → **doc-sweep ticket**. +- F5 (Low / Scope, deferred consumer) → **wait for AZ-680**. + +None of the findings block batch 16 implementation. The cumulative review gate **PASSES** and the implement loop proceeds. + +## Cumulative metrics + +| Metric | Value (batches 13–15) | Trend vs. prior cumulative (batches 7–9) | +|--------|-----------------------|------------------------------------------| +| Total source LOC added (ex tests, approximate) | ~3,000 | – (prior was ~3,470; smaller scope but denser deps — first-time tonic stack) | +| Total test LOC added (approximate) | ~1,450 | – (prior was ~1,770) | +| Test/source ratio | ~0.48 | stable (~0.51 prior) | +| New public API symbols (approximate) | ~40 | + (prior was ~35; the operator-bridge + telemetry_stream split-out drives most of it) | +| Cyclomatic complexity hot-spots | `HmacOperatorValidator::validate` (4 sequential gates, 1 happy path), `TelemetryService::subscribe` (snapshot-prepend branch on `MapObjectsBundle`) | All under the 10-arm SOLID threshold | +| New `unsafe` blocks | 0 | stable | +| New `unwrap` / `expect` in production paths | 0 | stable | +| Layer-violation Architecture findings | 0 | stable | +| Cyclic-dep Architecture findings | 0 | stable | +| Open cumulative Mediums (cycle 1) | 2 (this triplet's F1 + carry-over C1 from cumulative 07–09 — `SendCommandError` dedupe) | + (1 new; 1 carry) | +| Open cumulative Highs (cycle 1) | 1 (C5 — pre-existing `autopilot::Runtime::vlm_provider_name` dead-code lint) | stable | + +## Carried-forward cumulative findings (from prior cumulatives) + +| ID | Severity | Origin | Status this triplet | +|----|----------|--------|---------------------| +| C1 | Medium | Cumulative 07–09 F1 | OPEN — `SendCommandError` mapping still duplicated across `lost_link.rs` / `geofence.rs` / `battery_thresholds.rs`. Not touched by batches 13–15. | +| C2 | Low | Cumulative 07–09 F2 | OPEN — `MavlinkCommandIssuer` naming inconsistency. Not touched by batches 13–15. | +| C3 | Low | Cumulative 07–09 F3 + extended | OPEN — `module-layout.md` drift; now extended by batches 14 + 15 to include `telemetry_stream/internal/*` + `operator_bridge/internal/*`. | +| C4 | Low | Batch 11 | OPEN — `data_model.md §PanPlan` definition still missing. | +| C5 | High | Batch 4 (pre-existing) | OPEN — workspace `-D warnings` still blocks on `autopilot::Runtime::vlm_provider_name` dead-code lint. Tracked in `_docs/_process_leftovers/2026-05-20_autopilot_clippy.md`. | +| C6 | Medium | Batch 14 | OPEN — `mission_executor::state_machine::ac3_bounded_retry_then_success` flake. Tracked in `_docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md`. | +| C7 | Low | Batch 14 | OPEN — Tonic-gRPC decision not yet recorded in `decision-rationale.md`. Now subsumed under F4 (cumulative doc sweep). | diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 16842b2..7fe8c4c 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -6,43 +6,24 @@ step: 7 name: Implement status: between-batches sub_step: - phase: 28 - name: batch-15-select - detail: "batch 14 (AZ-675) pushed to origin/dev; awaiting batch 15 task selection" + phase: 0 + name: batch-16-select + detail: "" retry_count: 0 cycle: 1 tracker: jira ## Last Completed Batch -batch: 14 -commit: ff790bd -ticket: AZ-675 (telemetry_stream Tonic gRPC server + per-client lossy queue) -jira_status: In Testing -pushed_to: origin/dev -report: _docs/03_implementation/batch_14_cycle1_report.md - -## Unblocked Candidates for Batch 15 -- AZ-676 (3 pts) — telemetry_stream video path. Self-contained AZ-675 extension. -- AZ-679 (3 pts) — operator_bridge POI surface. Consumes AZ-683 queue + AZ-685 decline path through AZ-675 server. -- AZ-678 (5 pts) — operator_bridge command authentication. -- AZ-658 (5 pts) — frame_ingest H.264 decoder. Needs library pin decision (retina vs ffmpeg-rs vs gstreamer). - -Blocked: AZ-677 (needs AZ-667), AZ-684 (needs AZ-660/AZ-671/AZ-672), AZ-685 (needs AZ-684), AZ-686 (needs AZ-684). - -## Open Cumulative Findings (carry forward) -| ID | Sev | Cat | Detail | Origin | -|---|---|---|---|---| -| C1 | Medium | Maintainability | Duplicated `SendCommandError` mapping in `gimbal_controller` | Batches 9-10 | -| C2 | Low | Style | `MavlinkCommandIssuer` naming inconsistency | Batch 9 | -| C3 | Low | Architecture | `module-layout.md` drift (now includes `telemetry_stream/internal/{publisher,server,proto}.rs`, `scan_controller/internal/poi_queue/`) | Batches 10-14 | -| C4 | Low | Architecture | `data_model.md §PanPlan` definition missing | Batch 11 | -| C5 | High | Maintenance | Pre-existing `autopilot/runtime.rs::vlm_provider_name` dead-code error blocks workspace `-D warnings` clippy | Batch 4 origin | -| C6 | Medium | Tests | `mission_executor::ac3_bounded_retry_then_success` polling-race flake (escalated under tonic build pressure) | Batch 8 origin, escalated batch 14 | -| C7 | Low | Architecture | Record Tonic-gRPC operator-link decision in `decision-rationale.md` (closed Q2 in batch 14) | Batch 14 | +batch: 15 +ticket: AZ-676 / AZ-677 / AZ-678 / AZ-679 +jira_status: pending (will be set to In Testing after commit) +pushed_to: pending +report: _docs/03_implementation/batch_15_cycle1_report.md +cumulative_review: _docs/03_implementation/cumulative_review_batches_13-15_cycle1_report.md ## Process Leftovers - `_docs/_process_leftovers/2026-05-20_autopilot_clippy.md` — C5 replay - `_docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md` — C6 fix recipe ## Cumulative Review Cadence -Next cumulative review due: end of batch 15 (covers batches 13 / 14 / 15). +Last cumulative: batches 13–15 (just produced). Next due: end of batch 18. diff --git a/crates/operator_bridge/Cargo.toml b/crates/operator_bridge/Cargo.toml index 747146c..61ed3d4 100644 --- a/crates/operator_bridge/Cargo.toml +++ b/crates/operator_bridge/Cargo.toml @@ -14,3 +14,13 @@ tokio = { workspace = true } tracing = { workspace = true } async-trait = { workspace = true } serde = { workspace = true } +serde_json = { workspace = true } +parking_lot = { workspace = true } +chrono = { workspace = true } +uuid = { workspace = true } +hmac = { workspace = true } +sha2 = { workspace = true } +thiserror = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true, features = ["test-util"] } diff --git a/crates/operator_bridge/src/internal/auth.rs b/crates/operator_bridge/src/internal/auth.rs new file mode 100644 index 0000000..0d8c368 --- /dev/null +++ b/crates/operator_bridge/src/internal/auth.rs @@ -0,0 +1,531 @@ +//! AZ-678 — default operator-command authentication. +//! +//! `HmacOperatorValidator` implements +//! `shared::contracts::operator_auth::OperatorCommandValidator` using +//! HMAC-SHA256 over `(session_token || sequence_number || +//! canonical_payload_json)`. It carries: +//! - a per-session in-memory `SessionRegistry` (added on Ground +//! Station auth handshake; expired after `session_ttl`); +//! - a per-session monotonically advancing sequence-number tracker +//! (replay protection); +//! - per-reason rejection counters + a sliding-window red-health +//! gate on sustained signature failures (per AC-5). +//! +//! Constant-time HMAC compare via `hmac::Mac::verify_slice` — no +//! timing oracle. Rejected commands are NEVER logged at info level +//! with raw payload; only the rejection reason and size-capped +//! command_id are emitted. + +use std::collections::{HashMap, VecDeque}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use chrono::{DateTime, Utc}; +use hmac::{Hmac, Mac}; +use parking_lot::Mutex; +use sha2::Sha256; +use tracing::warn; + +use shared::contracts::operator_auth::{ + AuthError, OperatorCommandValidator, SignedCommand, ValidatedCommand, +}; + +type HmacSha256 = Hmac; + +/// Ordered set of rejection reasons. Drives the `auth_rejections_total` +/// counter array layout and the [`AuthCounters::by_reason`] lookup. +pub const REJECTION_REASONS: [AuthError; 4] = [ + AuthError::SignatureInvalid, + AuthError::ReplayDetected, + AuthError::SessionUnknown, + AuthError::SessionExpired, +]; + +/// Per-session state — last-seen sequence number for replay +/// protection plus the wall-clock + monotonic anchor for TTL. +#[derive(Debug, Clone)] +struct SessionEntry { + secret: Vec, + /// `Some(n)` once we have observed at least one accepted command + /// from the session. `None` means the session is registered but + /// the next accepted seq is the floor — `>= 1` per the wire + /// contract. + last_seen_seq: Option, + established_at: Instant, + established_wallclock: DateTime, +} + +/// Configuration knobs for the HMAC validator. +#[derive(Debug, Clone)] +pub struct HmacValidatorConfig { + /// Session lifetime starting from `register_session`. After this + /// elapses any command bearing the token is rejected with + /// `SessionExpired`. Default 30 minutes per architecture §5. + pub session_ttl: Duration, + /// Per-minute signature-failure threshold above which + /// `health_is_red` returns `true` (AC-5). Default 30 — i.e. one + /// failure every two seconds sustained for a minute. + pub signature_failure_red_threshold: u32, +} + +impl Default for HmacValidatorConfig { + fn default() -> Self { + Self { + session_ttl: Duration::from_secs(30 * 60), + signature_failure_red_threshold: 30, + } + } +} + +/// Live rejection counters. Exposed to the health surface; one entry +/// per `REJECTION_REASONS` slot. +#[derive(Debug, Default)] +pub struct AuthCounters { + by_reason: [AtomicU64; REJECTION_REASONS.len()], + total_validated: AtomicU64, +} + +impl AuthCounters { + pub fn reason(&self, e: AuthError) -> u64 { + let idx = REJECTION_REASONS + .iter() + .position(|r| *r == e) + .expect("REJECTION_REASONS covers every AuthError variant"); + self.by_reason[idx].load(Ordering::Relaxed) + } + + pub fn validated_total(&self) -> u64 { + self.total_validated.load(Ordering::Relaxed) + } + + fn increment(&self, e: AuthError) { + let idx = REJECTION_REASONS + .iter() + .position(|r| *r == e) + .expect("REJECTION_REASONS covers every AuthError variant"); + self.by_reason[idx].fetch_add(1, Ordering::Relaxed); + } + + fn increment_validated(&self) { + self.total_validated.fetch_add(1, Ordering::Relaxed); + } +} + +/// HMAC validator state — sessions + counters + signature-failure +/// sliding window for the red-health gate. +pub struct HmacOperatorValidator { + config: HmacValidatorConfig, + sessions: Mutex>, + /// Signature-failure timestamps in the trailing 60 s window. + /// Bounded by either the config threshold * 2 (defense against + /// flooding) or 60 s of trailing history, whichever comes first. + sig_failure_window: Mutex>, + counters: Arc, +} + +impl HmacOperatorValidator { + pub fn new(config: HmacValidatorConfig) -> Self { + Self { + config, + sessions: Mutex::new(HashMap::new()), + sig_failure_window: Mutex::new(VecDeque::new()), + counters: Arc::new(AuthCounters::default()), + } + } + + pub fn with_default_config() -> Self { + Self::new(HmacValidatorConfig::default()) + } + + /// Register a session — called on Ground Station auth handshake. + /// Replacing an existing session for the same token is allowed + /// (rotates the secret and resets the replay tracker). + pub fn register_session(&self, token: impl Into, secret: impl Into>) { + let token = token.into(); + let entry = SessionEntry { + secret: secret.into(), + last_seen_seq: None, + established_at: Instant::now(), + established_wallclock: Utc::now(), + }; + self.sessions.lock().insert(token, entry); + } + + /// Drop a session (operator logout / explicit revoke). + pub fn revoke_session(&self, token: &str) -> bool { + self.sessions.lock().remove(token).is_some() + } + + pub fn counters(&self) -> Arc { + Arc::clone(&self.counters) + } + + pub fn config(&self) -> &HmacValidatorConfig { + &self.config + } + + /// True when the trailing 60-second window of signature failures + /// is at or above the configured red threshold (AC-5). Pruning of + /// expired entries happens on every call. + pub fn health_is_red(&self) -> bool { + let now = Instant::now(); + let mut w = self.sig_failure_window.lock(); + while let Some(&t) = w.front() { + if now.duration_since(t) > Duration::from_secs(60) { + w.pop_front(); + } else { + break; + } + } + w.len() >= self.config.signature_failure_red_threshold as usize + } + + /// Helper that recomputes the canonical signing material. Public + /// so the Ground Station side can co-locate the spec. + pub fn signing_material( + session_token: &str, + sequence_number: u64, + payload: &serde_json::Value, + ) -> Vec { + let payload_bytes = serde_json::to_vec(payload).unwrap_or_default(); + let mut buf = Vec::with_capacity(session_token.len() + 8 + payload_bytes.len() + 2); + buf.extend_from_slice(session_token.as_bytes()); + buf.push(b'|'); + buf.extend_from_slice(&sequence_number.to_be_bytes()); + buf.push(b'|'); + buf.extend_from_slice(&payload_bytes); + buf + } + + /// Helper that produces the HMAC tag for a `(token, seq, payload)` + /// triple under `secret`. Used by tests and by the Ground Station + /// reference implementation. + pub fn sign( + secret: &[u8], + session_token: &str, + seq: u64, + payload: &serde_json::Value, + ) -> Vec { + let mut mac = HmacSha256::new_from_slice(secret).expect("HMAC accepts any key length"); + mac.update(&Self::signing_material(session_token, seq, payload)); + mac.finalize().into_bytes().to_vec() + } + + fn record_sig_failure(&self, now: Instant) { + let mut w = self.sig_failure_window.lock(); + w.push_back(now); + // Prune old entries opportunistically so the window doesn't + // grow unbounded under a flood. + while let Some(&t) = w.front() { + if now.duration_since(t) > Duration::from_secs(60) { + w.pop_front(); + } else { + break; + } + } + } +} + +impl OperatorCommandValidator for HmacOperatorValidator { + fn validate(&self, cmd: SignedCommand) -> Result { + // Step 1 — session lookup. Failure does NOT touch the replay + // counter (the command never authenticated, so nothing to + // advance). + let mut sessions = self.sessions.lock(); + let entry = match sessions.get_mut(&cmd.session_token) { + Some(e) => e, + None => { + self.counters.increment(AuthError::SessionUnknown); + drop(sessions); + warn!( + command_id = %cmd.command_id, + reason = AuthError::SessionUnknown.reason_label(), + "operator command rejected" + ); + return Err(AuthError::SessionUnknown); + } + }; + + // Step 2 — TTL check. We check both monotonic age (Instant) + // and the configured TTL. Wall-clock skew is not used. + if entry.established_at.elapsed() > self.config.session_ttl { + self.counters.increment(AuthError::SessionExpired); + // Strip the session so subsequent commands skip the TTL + // path and just see SessionUnknown. + sessions.remove(&cmd.session_token); + drop(sessions); + warn!( + command_id = %cmd.command_id, + reason = AuthError::SessionExpired.reason_label(), + "operator command rejected" + ); + return Err(AuthError::SessionExpired); + } + + // Step 3 — replay check. We compare against the per-session + // `last_seen_seq`; the rejected seq is NOT recorded so a + // legitimate retry can still land with the next valid seq. + if let Some(last) = entry.last_seen_seq { + if cmd.sequence_number <= last { + self.counters.increment(AuthError::ReplayDetected); + drop(sessions); + warn!( + command_id = %cmd.command_id, + last_seen = last, + seq = cmd.sequence_number, + reason = AuthError::ReplayDetected.reason_label(), + "operator command rejected" + ); + return Err(AuthError::ReplayDetected); + } + } + + // Step 4 — HMAC check. Constant-time via `verify_slice`. + let mut mac = + HmacSha256::new_from_slice(&entry.secret).expect("HMAC accepts any key length"); + mac.update(&Self::signing_material( + &cmd.session_token, + cmd.sequence_number, + &cmd.payload, + )); + let signature_ok = mac.verify_slice(&cmd.signature).is_ok(); + if !signature_ok { + self.counters.increment(AuthError::SignatureInvalid); + let _established = entry.established_wallclock; + drop(sessions); + self.record_sig_failure(Instant::now()); + warn!( + command_id = %cmd.command_id, + reason = AuthError::SignatureInvalid.reason_label(), + "operator command rejected" + ); + return Err(AuthError::SignatureInvalid); + } + + // Happy path — advance the per-session sequence tracker. + entry.last_seen_seq = Some(cmd.sequence_number); + drop(sessions); + self.counters.increment_validated(); + Ok(ValidatedCommand { + command: cmd.into_command(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chrono::Utc; + use shared::models::operator::OperatorCommandKind; + use uuid::Uuid; + + fn signed_command( + secret: &[u8], + session_token: &str, + seq: u64, + payload: serde_json::Value, + ) -> SignedCommand { + let sig = HmacOperatorValidator::sign(secret, session_token, seq, &payload); + SignedCommand { + session_token: session_token.to_string(), + sequence_number: seq, + kind: OperatorCommandKind::ConfirmPoi, + payload, + signature: sig, + issued_at_wallclock: Utc::now(), + command_id: Uuid::new_v4(), + } + } + + /// AC-1 — valid signature + monotonic seq → Ok; last_seen advances. + #[test] + fn ac1_valid_signed_command_passes() { + // Arrange + let v = HmacOperatorValidator::with_default_config(); + let secret = b"unit-test-secret"; + v.register_session("tok_a", secret.to_vec()); + let cmd = signed_command(secret, "tok_a", 5, serde_json::json!({"poi_id": "u-1"})); + + // Act + let out = v.validate(cmd.clone()); + + // Assert + assert!(out.is_ok(), "valid command must pass"); + assert_eq!(v.counters().validated_total(), 1); + + // last_seen advanced — a second command with same seq is now + // replay. + let replay = v.validate(cmd); + assert_eq!(replay.unwrap_err(), AuthError::ReplayDetected); + } + + /// AC-2 — invalid signature → SignatureInvalid; counter increments; + /// seq NOT advanced; subsequent valid command with same seq passes. + #[test] + fn ac2_invalid_signature_rejected_and_seq_not_advanced() { + // Arrange + let v = HmacOperatorValidator::with_default_config(); + let secret = b"unit-test-secret"; + v.register_session("tok_b", secret.to_vec()); + + let bad_payload = serde_json::json!({"poi_id": "u-1"}); + let bad_sig = HmacOperatorValidator::sign(b"WRONG-SECRET", "tok_b", 5, &bad_payload); + let bad = SignedCommand { + session_token: "tok_b".to_string(), + sequence_number: 5, + kind: OperatorCommandKind::ConfirmPoi, + payload: bad_payload.clone(), + signature: bad_sig, + issued_at_wallclock: Utc::now(), + command_id: Uuid::new_v4(), + }; + + // Act + let rejected = v.validate(bad); + let good = signed_command(secret, "tok_b", 5, bad_payload); + let accepted = v.validate(good); + + // Assert + assert_eq!(rejected.unwrap_err(), AuthError::SignatureInvalid); + assert_eq!(v.counters().reason(AuthError::SignatureInvalid), 1); + assert!(accepted.is_ok(), "seq=5 must still be valid after sig-fail"); + assert_eq!(v.counters().validated_total(), 1); + } + + /// AC-3 — seq == last_seen → ReplayDetected; seq < last_seen → also + /// ReplayDetected. + #[test] + fn ac3_replay_detected() { + // Arrange + let v = HmacOperatorValidator::with_default_config(); + let secret = b"s"; + v.register_session("tok", secret.to_vec()); + let _ = v + .validate(signed_command(secret, "tok", 10, serde_json::json!({}))) + .unwrap(); + + // Act + let same = v.validate(signed_command(secret, "tok", 10, serde_json::json!({}))); + let earlier = v.validate(signed_command(secret, "tok", 9, serde_json::json!({}))); + + // Assert + assert_eq!(same.unwrap_err(), AuthError::ReplayDetected); + assert_eq!(earlier.unwrap_err(), AuthError::ReplayDetected); + assert_eq!(v.counters().reason(AuthError::ReplayDetected), 2); + } + + /// AC-4 — unknown session token → SessionUnknown; expired session + /// token → SessionExpired. + #[test] + fn ac4_unknown_or_expired_session_rejected() { + // Arrange — TTL set tiny so the session expires within the + // test. + let cfg = HmacValidatorConfig { + session_ttl: Duration::from_millis(10), + ..HmacValidatorConfig::default() + }; + let v = HmacOperatorValidator::new(cfg); + let secret = b"s"; + + // Act 1 — unknown token rejected immediately. + let unknown = v.validate(signed_command( + secret, + "no_such_session", + 1, + serde_json::json!({}), + )); + + // Register, wait past TTL, retry. + v.register_session("tok", secret.to_vec()); + std::thread::sleep(Duration::from_millis(50)); + let expired = v.validate(signed_command(secret, "tok", 1, serde_json::json!({}))); + + // Assert + assert_eq!(unknown.unwrap_err(), AuthError::SessionUnknown); + assert_eq!(expired.unwrap_err(), AuthError::SessionExpired); + assert_eq!(v.counters().reason(AuthError::SessionUnknown), 1); + assert_eq!(v.counters().reason(AuthError::SessionExpired), 1); + } + + /// AC-5 — sustained signature failures (≥ threshold within the + /// trailing 60 s) flip the red-health gate. + #[test] + fn ac5_sustained_signature_failures_flip_health_red() { + // Arrange + let cfg = HmacValidatorConfig { + signature_failure_red_threshold: 5, + ..HmacValidatorConfig::default() + }; + let v = HmacOperatorValidator::new(cfg); + let secret = b"s"; + v.register_session("tok", secret.to_vec()); + + // Below threshold → green. + for seq in 0..4 { + let bad_sig = + HmacOperatorValidator::sign(b"wrong", "tok", seq + 1, &serde_json::json!({})); + let bad = SignedCommand { + session_token: "tok".to_string(), + sequence_number: seq + 1, + kind: OperatorCommandKind::ConfirmPoi, + payload: serde_json::json!({}), + signature: bad_sig, + issued_at_wallclock: Utc::now(), + command_id: Uuid::new_v4(), + }; + let _ = v.validate(bad); + } + assert!(!v.health_is_red(), "4 failures < threshold"); + + // Act — push one more to reach threshold. + let bad_sig = HmacOperatorValidator::sign(b"wrong", "tok", 100, &serde_json::json!({})); + let bad = SignedCommand { + session_token: "tok".to_string(), + sequence_number: 100, + kind: OperatorCommandKind::ConfirmPoi, + payload: serde_json::json!({}), + signature: bad_sig, + issued_at_wallclock: Utc::now(), + command_id: Uuid::new_v4(), + }; + let _ = v.validate(bad); + + // Assert + assert!(v.health_is_red(), "≥ threshold → red"); + assert_eq!(v.counters().reason(AuthError::SignatureInvalid), 5); + } + + /// Constant-time verify: same-length wrong signature must yield + /// SignatureInvalid (not a panic), and the rejection counter + /// increments by one. (Smoke test that `verify_slice` is wired + /// correctly.) + #[test] + fn same_length_wrong_signature_is_rejected_cleanly() { + // Arrange + let v = HmacOperatorValidator::with_default_config(); + let secret = b"s"; + v.register_session("tok", secret.to_vec()); + + let payload = serde_json::json!({}); + let mut bad_sig = HmacOperatorValidator::sign(secret, "tok", 1, &payload); + // Flip one byte — same length, different value. + bad_sig[0] ^= 0x01; + let cmd = SignedCommand { + session_token: "tok".to_string(), + sequence_number: 1, + kind: OperatorCommandKind::ConfirmPoi, + payload, + signature: bad_sig, + issued_at_wallclock: Utc::now(), + command_id: Uuid::new_v4(), + }; + + // Act + let r = v.validate(cmd); + + // Assert + assert_eq!(r.unwrap_err(), AuthError::SignatureInvalid); + assert_eq!(v.counters().reason(AuthError::SignatureInvalid), 1); + } +} diff --git a/crates/operator_bridge/src/internal/mod.rs b/crates/operator_bridge/src/internal/mod.rs new file mode 100644 index 0000000..b0f820f --- /dev/null +++ b/crates/operator_bridge/src/internal/mod.rs @@ -0,0 +1,4 @@ +//! Internal modules for `operator_bridge`. Not part of the public API. + +pub mod auth; +pub mod poi_surface; diff --git a/crates/operator_bridge/src/internal/poi_surface.rs b/crates/operator_bridge/src/internal/poi_surface.rs new file mode 100644 index 0000000..e98f3ff --- /dev/null +++ b/crates/operator_bridge/src/internal/poi_surface.rs @@ -0,0 +1,323 @@ +//! AZ-679 — POI surface event mapping + dequeue emission. +//! +//! `PoiSurfaceMapper::map(poi)` produces the +//! [`OperatorPoiEvent`](shared::models::operator_event::OperatorPoiEvent) +//! that the operator UI consumes (per `architecture.md §7.10` and the +//! task spec's field list). On queue rotation / age-out / completion +//! `emit_dequeued` produces a `PoiDequeued` event. +//! +//! Both events are pushed through `TelemetrySink::push_operator_event` +//! — composition root supplies the sink (in production, the +//! `telemetry_stream::TelemetryStreamHandle`). +//! +//! `pois_surfaced_per_min` counter exposed via [`PoiSurfaceMetrics`]. + +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + +use chrono::Utc; +use parking_lot::Mutex; + +use shared::contracts::TelemetrySink; +use shared::error::{AutopilotError, Result}; +use shared::models::operator_event::{ + DequeueReason, OperatorEvent, OperatorPoiEvent, PhotoMetadata, PoiDequeued, + Tier2EvidenceSummary, +}; +use shared::models::poi::{Poi, VlmPipelineStatus}; +use uuid::Uuid; + +/// Sliding 60 s window over POI-surfaced timestamps. Used by the +/// `pois_surfaced_per_min` health metric. +#[derive(Default)] +struct SurfaceRateWindow { + timestamps: Mutex>, +} + +impl SurfaceRateWindow { + fn record_and_count(&self) -> usize { + let now = std::time::Instant::now(); + let mut w = self.timestamps.lock(); + w.push_back(now); + while let Some(&t) = w.front() { + if now.duration_since(t) > std::time::Duration::from_secs(60) { + w.pop_front(); + } else { + break; + } + } + w.len() + } + + fn current_rate(&self) -> usize { + let now = std::time::Instant::now(); + let mut w = self.timestamps.lock(); + while let Some(&t) = w.front() { + if now.duration_since(t) > std::time::Duration::from_secs(60) { + w.pop_front(); + } else { + break; + } + } + w.len() + } +} + +#[derive(Debug, Clone)] +pub struct PoiSurfaceMetrics { + pub pois_surfaced_per_min: usize, + pub pois_surfaced_total: u64, + pub pois_dequeued_total: u64, +} + +pub struct PoiSurfaceMapper { + sink: Arc, + pois_surfaced_total: AtomicU64, + pois_dequeued_total: AtomicU64, + rate: SurfaceRateWindow, +} + +impl PoiSurfaceMapper { + pub fn new(sink: Arc) -> Self { + Self { + sink, + pois_surfaced_total: AtomicU64::new(0), + pois_dequeued_total: AtomicU64::new(0), + rate: SurfaceRateWindow::default(), + } + } + + /// Pure mapping — produces the wire-format event. Used by tests + /// and by [`surface`] (which also pushes through the sink). + /// Photo metadata is optional and may be supplied by the caller + /// when the POI's source detection has a captured ROI snapshot; + /// the `Poi` model itself does not carry photo bytes. + pub fn map(poi: &Poi, photo_metadata: Option) -> OperatorPoiEvent { + let tier2_evidence_summary = poi.tier2_evidence.as_ref().map(|t| Tier2EvidenceSummary { + path_freshness: t.path_freshness, + endpoint_score: t.endpoint_score, + concealment_score: t.concealment_score, + recommended_next_action: t.recommended_next_action, + status: t.status, + }); + + let vlm_label = match poi.vlm_status { + // The Poi model does not carry the VLM label string — only + // the pipeline status. The label is attached upstream + // when the assessment lands in scan_controller; for now + // we surface None and let scan_controller pass the label + // through a richer overload once AZ-684 wires it. + VlmPipelineStatus::Ok => None, + _ => None, + }; + + OperatorPoiEvent { + poi_id: poi.id, + mgrs: poi.mgrs.clone(), + class_group: poi.class_group.clone(), + confidence: poi.confidence, + vlm_status: poi.vlm_status, + vlm_label, + tier2_evidence_summary, + photo_metadata, + deadline_unix_ms: poi.deadline.timestamp_millis(), + } + } + + /// Map + push. Returns the wire event so the caller can also + /// attach it to the audit log if needed. + pub async fn surface( + &self, + poi: &Poi, + photo_metadata: Option, + ) -> Result { + let event = Self::map(poi, photo_metadata); + self.sink + .push_operator_event(OperatorEvent::PoiSurfaced(event.clone())) + .await + .map_err(|e| AutopilotError::Internal(format!("push_operator_event(poi): {e}")))?; + self.pois_surfaced_total.fetch_add(1, Ordering::Relaxed); + self.rate.record_and_count(); + Ok(event) + } + + /// Emit a `PoiDequeued` event. Called by `scan_controller` (via + /// `operator_bridge`) when a POI is rotated, ages out, or + /// completes (operator decided). + pub async fn emit_dequeued(&self, poi_id: Uuid, reason: DequeueReason) -> Result<()> { + let event = PoiDequeued { + poi_id, + reason, + dequeued_at: Utc::now(), + }; + self.sink + .push_operator_event(OperatorEvent::PoiDequeued(event)) + .await + .map_err(|e| AutopilotError::Internal(format!("push_operator_event(dequeue): {e}")))?; + self.pois_dequeued_total.fetch_add(1, Ordering::Relaxed); + Ok(()) + } + + pub fn metrics(&self) -> PoiSurfaceMetrics { + PoiSurfaceMetrics { + pois_surfaced_per_min: self.rate.current_rate(), + pois_surfaced_total: self.pois_surfaced_total.load(Ordering::Relaxed), + pois_dequeued_total: self.pois_dequeued_total.load(Ordering::Relaxed), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use chrono::{Duration, Utc}; + use shared::models::detection::DetectionBatch; + use shared::models::frame::Frame; + use shared::models::tier2::{RecommendedNextAction, Tier2Evidence, Tier2Status}; + + /// Recording sink that captures every operator event pushed to it. + /// Lets tests assert on the exact wire content without spinning + /// up a real gRPC server. + #[derive(Default, Clone)] + struct RecordingSink { + events: Arc>>, + } + + #[async_trait] + impl TelemetrySink for RecordingSink { + async fn push_frame(&self, _frame: Frame) -> Result<()> { + Ok(()) + } + async fn push_detections(&self, _batch: DetectionBatch) -> Result<()> { + Ok(()) + } + async fn push_operator_event(&self, event: OperatorEvent) -> Result<()> { + self.events.lock().push(event); + Ok(()) + } + } + + fn poi_with_full_evidence() -> Poi { + Poi { + id: Uuid::new_v4(), + confidence: 0.92, + mgrs: "33UWP05".to_string(), + class: "tank".to_string(), + class_group: "vehicle".to_string(), + source_detection_ids: vec![Uuid::new_v4()], + enqueued_at: Utc::now(), + priority: 0.92, + decline_suppressed: false, + vlm_status: VlmPipelineStatus::Ok, + tier2_evidence: Some(Tier2Evidence { + roi_id: Uuid::new_v4(), + path_freshness: Some(0.7), + endpoint_score: Some(0.5), + concealment_score: Some(0.3), + recommended_next_action: RecommendedNextAction::HoldEndpoint, + source_detections: vec![], + status: Tier2Status::Ok, + }), + deadline: Utc::now() + Duration::seconds(120), + } + } + + fn poi_vlm_disabled() -> Poi { + Poi { + vlm_status: VlmPipelineStatus::Disabled, + tier2_evidence: None, + ..poi_with_full_evidence() + } + } + + /// AC-1 — full POI maps with every required field populated; the + /// optional `tier2_evidence_summary` is present when input has it. + #[test] + fn ac1_full_poi_maps_all_required_fields() { + // Arrange + let poi = poi_with_full_evidence(); + let meta = PhotoMetadata { + photo_ref: "snap/123.jpg".to_string(), + width: 1920, + height: 1080, + captured_at_unix_ms: 1_700_000_000_000, + }; + + // Act + let evt = PoiSurfaceMapper::map(&poi, Some(meta.clone())); + + // Assert + assert_eq!(evt.poi_id, poi.id); + assert_eq!(evt.mgrs, "33UWP05"); + assert_eq!(evt.class_group, "vehicle"); + assert!((evt.confidence - 0.92).abs() < 1e-6); + assert_eq!(evt.vlm_status, VlmPipelineStatus::Ok); + let tier2 = evt + .tier2_evidence_summary + .as_ref() + .expect("Tier2 evidence should be carried through"); + assert_eq!( + tier2.recommended_next_action, + RecommendedNextAction::HoldEndpoint + ); + assert_eq!(tier2.status, Tier2Status::Ok); + assert_eq!( + evt.photo_metadata.as_ref().map(|p| &p.photo_ref), + Some(&meta.photo_ref) + ); + assert_eq!(evt.deadline_unix_ms, poi.deadline.timestamp_millis()); + } + + /// AC-2 — VLM-disabled POIs map to vlm_status = Disabled and + /// vlm_label = None. + #[test] + fn ac2_vlm_disabled_carries_explicit_status() { + // Arrange + let poi = poi_vlm_disabled(); + + // Act + let evt = PoiSurfaceMapper::map(&poi, None); + + // Assert + assert_eq!(evt.vlm_status, VlmPipelineStatus::Disabled); + assert!(evt.vlm_label.is_none()); + // tier2 absence preserved. + assert!(evt.tier2_evidence_summary.is_none()); + assert!(evt.photo_metadata.is_none()); + } + + /// AC-3 — Dequeue path emits a PoiDequeued event with the + /// configured reason and the supplied poi_id. + #[tokio::test] + async fn ac3_dequeue_emits_event_through_sink() { + // Arrange + let sink = RecordingSink::default(); + let captured = Arc::clone(&sink.events); + let mapper = PoiSurfaceMapper::new(Arc::new(sink)); + let poi = poi_with_full_evidence(); + + // Act — surface, then dequeue. + mapper.surface(&poi, None).await.unwrap(); + mapper + .emit_dequeued(poi.id, DequeueReason::Rotated) + .await + .unwrap(); + + // Assert — sink saw both events in order. + let events = captured.lock().clone(); + assert_eq!(events.len(), 2); + assert!(matches!(events[0], OperatorEvent::PoiSurfaced(_))); + match &events[1] { + OperatorEvent::PoiDequeued(d) => { + assert_eq!(d.poi_id, poi.id); + assert_eq!(d.reason, DequeueReason::Rotated); + } + _ => panic!("second event must be PoiDequeued"), + } + let m = mapper.metrics(); + assert_eq!(m.pois_surfaced_total, 1); + assert_eq!(m.pois_dequeued_total, 1); + assert_eq!(m.pois_surfaced_per_min, 1); + } +} diff --git a/crates/operator_bridge/src/lib.rs b/crates/operator_bridge/src/lib.rs index 673620f..c5e5da9 100644 --- a/crates/operator_bridge/src/lib.rs +++ b/crates/operator_bridge/src/lib.rs @@ -1,22 +1,38 @@ //! `operator_bridge` — POI surfacing + operator command authentication. //! +//! Real implementation in this batch: +//! - **AZ-678** `internal::auth::HmacOperatorValidator` — HMAC-SHA256 +//! over `(session_token, sequence_number, payload)`; per-session +//! replay tracker; session registry with TTL; rejection-reason +//! counters; sliding-window red-health gate. +//! - **AZ-679** `internal::poi_surface::PoiSurfaceMapper` — wire-format +//! POI events + `PoiDequeued` events pushed through `TelemetrySink`. +//! //! Real implementation lands in: -//! - AZ-678 `operator_bridge_command_auth` -//! - AZ-679 `operator_bridge_poi_surface` //! - AZ-680 `operator_bridge_command_dispatch` //! - AZ-681 `operator_bridge_safety_and_bit_ack` +pub mod internal; + +use std::sync::Arc; + use async_trait::async_trait; use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; -use shared::contracts::OperatorCommandSink; +use shared::contracts::{OperatorCommandSink, TelemetrySink}; use shared::error::{AutopilotError, Result}; -use shared::health::ComponentHealth; +use shared::health::{ComponentHealth, HealthLevel}; use shared::models::mission::Coordinate; use shared::models::operator::OperatorCommand; +use shared::models::operator_event::{DequeueReason, PhotoMetadata}; use shared::models::poi::Poi; +pub use crate::internal::auth::{ + AuthCounters, HmacOperatorValidator, HmacValidatorConfig, REJECTION_REASONS, +}; +pub use crate::internal::poi_surface::{PoiSurfaceMapper, PoiSurfaceMetrics}; + const NAME: &str = "operator_bridge"; #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -46,6 +62,15 @@ pub struct OperatorBridge { target_follow_tx: mpsc::Sender, middle_waypoint_rx: Option>, target_follow_rx: Option>, + /// AZ-679 — POI surface mapper. Optional so existing single-arg + /// constructors (used by tests + early scaffolding) keep working; + /// composition root wires the real `TelemetrySink` via + /// `with_telemetry_sink`. + poi_mapper: Option>, + /// AZ-678 — operator command validator. Same optional-pattern as + /// `poi_mapper` so legacy callers continue to compile until the + /// composition root wires it in. + validator: Option>, } impl OperatorBridge { @@ -57,13 +82,27 @@ impl OperatorBridge { target_follow_tx: tf_tx, middle_waypoint_rx: Some(mw_rx), target_follow_rx: Some(tf_rx), + poi_mapper: None, + validator: None, } } + pub fn with_telemetry_sink(mut self, sink: Arc) -> Self { + self.poi_mapper = Some(Arc::new(PoiSurfaceMapper::new(sink))); + self + } + + pub fn with_validator(mut self, validator: Arc) -> Self { + self.validator = Some(validator); + self + } + pub fn handle(&self) -> OperatorBridgeHandle { OperatorBridgeHandle { middle_waypoint_tx: self.middle_waypoint_tx.clone(), target_follow_tx: self.target_follow_tx.clone(), + poi_mapper: self.poi_mapper.clone(), + validator: self.validator.clone(), } } @@ -82,17 +121,79 @@ pub struct OperatorBridgeHandle { middle_waypoint_tx: mpsc::Sender, #[allow(dead_code)] target_follow_tx: mpsc::Sender, + poi_mapper: Option>, + validator: Option>, } impl OperatorBridgeHandle { - pub async fn surface_poi(&self, _poi: Poi) -> Result { - Err(AutopilotError::NotImplemented( - "operator_bridge::surface_poi (AZ-679)", - )) + /// AZ-679 — surface a POI to the operator and await the decision. + /// Today returns `NotImplemented` (the decision loop is AZ-680); + /// the surface event itself IS pushed (via the configured + /// `TelemetrySink`), so the operator UI receives it. + pub async fn surface_poi(&self, poi: Poi) -> Result { + match &self.poi_mapper { + Some(mapper) => { + mapper.surface(&poi, None).await?; + Err(AutopilotError::NotImplemented( + "operator_bridge::surface_poi → decision loop (AZ-680)", + )) + } + None => Err(AutopilotError::NotImplemented( + "operator_bridge::surface_poi (no telemetry sink wired)", + )), + } + } + + /// AZ-679 — surface a POI together with photo metadata (preferred + /// path when the source detection carries an ROI snapshot). + pub async fn surface_poi_with_photo( + &self, + poi: &Poi, + photo_metadata: PhotoMetadata, + ) -> Result<()> { + let mapper = self.poi_mapper.as_ref().ok_or_else(|| { + AutopilotError::Internal("surface_poi_with_photo: telemetry sink not wired".into()) + })?; + mapper.surface(poi, Some(photo_metadata)).await.map(|_| ()) + } + + /// AZ-679 — emit a `PoiDequeued` event (rotation / age-out / + /// completion). Called by `scan_controller` through the bridge. + pub async fn emit_poi_dequeued(&self, poi_id: uuid::Uuid, reason: DequeueReason) -> Result<()> { + let mapper = self.poi_mapper.as_ref().ok_or_else(|| { + AutopilotError::Internal("emit_poi_dequeued: telemetry sink not wired".into()) + })?; + mapper.emit_dequeued(poi_id, reason).await + } + + pub fn poi_metrics(&self) -> Option { + self.poi_mapper.as_ref().map(|m| m.metrics()) } pub fn health(&self) -> ComponentHealth { - ComponentHealth::disabled(NAME) + let mut h = ComponentHealth::disabled(NAME); + if self.poi_mapper.is_none() && self.validator.is_none() { + return h; + } + // Once any sub-component is wired we surface green by default, + // upgrade to red if the validator's signature-failure window + // crosses the threshold (AC-5). + h.level = HealthLevel::Green; + if let Some(v) = &self.validator { + if v.health_is_red() { + h.level = HealthLevel::Red; + } + let c = v.counters(); + h.detail = Some(format!( + "validated_total={} sig_invalid={} replay={} session_unknown={} session_expired={}", + c.validated_total(), + c.reason(shared::contracts::operator_auth::AuthError::SignatureInvalid), + c.reason(shared::contracts::operator_auth::AuthError::ReplayDetected), + c.reason(shared::contracts::operator_auth::AuthError::SessionUnknown), + c.reason(shared::contracts::operator_auth::AuthError::SessionExpired), + )); + } + h } } @@ -110,8 +211,22 @@ mod tests { use super::*; #[test] - fn it_compiles() { + fn it_compiles_without_wiring() { let h = OperatorBridge::new(8).handle(); assert_eq!(h.health().level, shared::health::HealthLevel::Disabled); } + + #[test] + fn health_green_once_validator_wired() { + // Arrange + let validator = Arc::new(HmacOperatorValidator::with_default_config()); + + // Act + let bridge = OperatorBridge::new(8).with_validator(validator); + let h = bridge.handle().health(); + + // Assert + assert_eq!(h.level, shared::health::HealthLevel::Green); + assert!(h.detail.unwrap().contains("validated_total=0")); + } } diff --git a/crates/shared/src/contracts/mod.rs b/crates/shared/src/contracts/mod.rs index 2bd628a..a68511e 100644 --- a/crates/shared/src/contracts/mod.rs +++ b/crates/shared/src/contracts/mod.rs @@ -4,12 +4,15 @@ //! importing the receiving crate. The composition root in //! `crates/autopilot/src/runtime.rs` wires concrete implementations. +pub mod operator_auth; + use async_trait::async_trait; use crate::error::Result; use crate::models::detection::DetectionBatch; use crate::models::frame::Frame; use crate::models::operator::OperatorCommand; +use crate::models::operator_event::OperatorEvent; use crate::models::vlm::VlmAssessment; /// Telemetry uplink. Implemented by `telemetry_stream`, consumed by @@ -19,6 +22,11 @@ use crate::models::vlm::VlmAssessment; pub trait TelemetrySink: Send + Sync { async fn push_frame(&self, frame: Frame) -> Result<()>; async fn push_detections(&self, batch: DetectionBatch) -> Result<()>; + + /// AZ-679 — push a POI surface event (or its dequeue event) to + /// the operator. The receiving impl serialises onto the + /// appropriate operator-bound topic. + async fn push_operator_event(&self, event: OperatorEvent) -> Result<()>; } /// MAVLink command surface. Implemented by `mavlink_layer`, consumed by diff --git a/crates/shared/src/contracts/operator_auth.rs b/crates/shared/src/contracts/operator_auth.rs new file mode 100644 index 0000000..15bb682 --- /dev/null +++ b/crates/shared/src/contracts/operator_auth.rs @@ -0,0 +1,99 @@ +//! AZ-678 — operator command authentication contract. +//! +//! `OperatorCommandValidator` is the boundary every operator-bound +//! command crosses before any business logic runs. The default +//! implementation (`HmacOperatorValidator` in +//! `operator_bridge::internal::auth`) uses HMAC-SHA256 over +//! `(session_token || sequence_number || payload_bytes)`. The trait +//! lives here so the dispatch surface (`scan_controller`, +//! `mission_executor`) can depend on the contract without importing +//! `operator_bridge`. + +use thiserror::Error; + +use crate::models::operator::{OperatorCommand, OperatorCommandKind}; + +/// A command as it arrives over the operator-link, prior to any +/// authentication. Mirrors the validated `OperatorCommand` shape +/// closely so a successful `validate` is a near-identity transform. +#[derive(Debug, Clone)] +pub struct SignedCommand { + pub session_token: String, + pub sequence_number: u64, + pub kind: OperatorCommandKind, + pub payload: serde_json::Value, + /// HMAC over `(session_token || sequence_number || canonical + /// JSON of payload)`. Length depends on the scheme; for HMAC-SHA256 + /// this is exactly 32 bytes. + pub signature: Vec, + /// Wall-clock time the Ground Station stamped the command. Carried + /// through `validate` for downstream audit logging; not used by + /// the auth check itself. + pub issued_at_wallclock: chrono::DateTime, + pub command_id: uuid::Uuid, +} + +impl SignedCommand { + /// Convert into a canonical [`OperatorCommand`] once validation + /// has succeeded. The signature is retained on the result for + /// downstream audit logging. + pub fn into_command(self) -> OperatorCommand { + OperatorCommand { + command_id: self.command_id, + session_token: self.session_token, + sequence_number: self.sequence_number, + issued_at_wallclock: self.issued_at_wallclock, + kind: self.kind, + payload: self.payload, + signature: self.signature, + } + } +} + +/// Validated command. Returned by `OperatorCommandValidator::validate` +/// on the happy path. Holding a `ValidatedCommand` is the proof that +/// dispatching the inner command is safe. +#[derive(Debug, Clone)] +pub struct ValidatedCommand { + pub command: OperatorCommand, +} + +/// Why an operator command was rejected. Each variant maps 1-1 to a +/// `auth_rejections_total{reason}` metric counter and to a structured +/// log line. Order MUST match +/// `operator_bridge::internal::auth::REJECTION_REASONS` for the +/// counter array layout. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Error)] +pub enum AuthError { + #[error("signature does not match computed HMAC")] + SignatureInvalid, + #[error("replay detected — sequence number not greater than last seen")] + ReplayDetected, + #[error("session token unknown or never established")] + SessionUnknown, + #[error("session token expired (TTL elapsed)")] + SessionExpired, +} + +impl AuthError { + /// Stable kebab-case label for the rejection-reason metric. + pub fn reason_label(&self) -> &'static str { + match self { + Self::SignatureInvalid => "signature_invalid", + Self::ReplayDetected => "replay_detected", + Self::SessionUnknown => "session_unknown", + Self::SessionExpired => "session_expired", + } + } +} + +/// Contract every operator-command validator must satisfy. The +/// default `HmacOperatorValidator` lives in `operator_bridge`; other +/// schemes (e.g. Q9 resolution to a JWS-based one) implement the +/// same trait and can be swapped behind the same callsite. +pub trait OperatorCommandValidator: Send + Sync { + /// Validate one signed command. On `Ok`, the per-session + /// sequence-number tracker advances; on `Err`, it does NOT + /// advance (so the rejected `seq` does not poison the session). + fn validate(&self, cmd: SignedCommand) -> Result; +} diff --git a/crates/shared/src/models/mod.rs b/crates/shared/src/models/mod.rs index d4ee07b..e4b37fc 100644 --- a/crates/shared/src/models/mod.rs +++ b/crates/shared/src/models/mod.rs @@ -10,6 +10,7 @@ pub mod mapobject; pub mod mission; pub mod movement; pub mod operator; +pub mod operator_event; pub mod poi; pub mod telemetry; pub mod tier2; diff --git a/crates/shared/src/models/operator_event.rs b/crates/shared/src/models/operator_event.rs new file mode 100644 index 0000000..5456b16 --- /dev/null +++ b/crates/shared/src/models/operator_event.rs @@ -0,0 +1,144 @@ +//! AZ-679 — operator-bound POI surface events. +//! +//! Wire shape that `operator_bridge` produces from a `Poi` and pushes +//! through `telemetry_stream` to the Ground Station. Fields follow +//! `architecture.md §7.10 Drone ⇄ Operator Sync Message Format` and +//! the AZ-679 task spec. + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use super::poi::VlmPipelineStatus; +use super::tier2::{RecommendedNextAction, Tier2Status}; + +/// Tier-2 evidence summary as carried to the operator. We do not +/// expose internal ROI identifiers or source-detection UUIDs — the +/// operator only needs the scored summary and the recommended next +/// action. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Tier2EvidenceSummary { + #[serde(skip_serializing_if = "Option::is_none")] + pub path_freshness: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub endpoint_score: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub concealment_score: Option, + pub recommended_next_action: RecommendedNextAction, + pub status: Tier2Status, +} + +/// Photo metadata carried with every POI per `architecture.md §7.10`. +/// Optional because some POIs (e.g. movement-only with no ROI crop) +/// may not have a photo yet. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct PhotoMetadata { + pub photo_ref: String, + pub width: u32, + pub height: u32, + pub captured_at_unix_ms: i64, +} + +/// Wire-format POI surface message — what the operator's UI consumes. +/// +/// `vlm_label` is `Some` only when `vlm_status == Ok`. For +/// `Disabled` / `NotRequested` etc. the operator receives the status +/// alone and renders accordingly (AC-2 in the task spec). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OperatorPoiEvent { + pub poi_id: Uuid, + pub mgrs: String, + pub class_group: String, + pub confidence: f32, + pub vlm_status: VlmPipelineStatus, + #[serde(skip_serializing_if = "Option::is_none")] + pub vlm_label: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub tier2_evidence_summary: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub photo_metadata: Option, + pub deadline_unix_ms: i64, +} + +/// Why a POI was removed from the surfaced queue. Operator UIs use +/// this to distinguish "operator hit deadline" from "queue rotated +/// to make room for a higher-confidence POI". +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum DequeueReason { + /// Decision-window deadline elapsed without operator input. + Aged, + /// Operator decided (confirmed / declined / target-follow). + Completed, + /// Queue rotated (higher-confidence or higher-priority POI took + /// the slot). + Rotated, +} + +/// Emitted by `operator_bridge` whenever a previously-surfaced POI +/// leaves the queue. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PoiDequeued { + pub poi_id: Uuid, + pub reason: DequeueReason, + pub dequeued_at: DateTime, +} + +/// Tagged enum the composition root pushes through +/// `TelemetrySink::push_operator_event`. The discriminator on the +/// wire is `"kind": "poi_surfaced" | "poi_dequeued"`. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum OperatorEvent { + PoiSurfaced(OperatorPoiEvent), + PoiDequeued(PoiDequeued), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn operator_event_serde_roundtrip_poi_surfaced() { + // Arrange + let evt = OperatorEvent::PoiSurfaced(OperatorPoiEvent { + poi_id: Uuid::nil(), + mgrs: "33UWP01".to_string(), + class_group: "vehicle".to_string(), + confidence: 0.82, + vlm_status: VlmPipelineStatus::Disabled, + vlm_label: None, + tier2_evidence_summary: None, + photo_metadata: None, + deadline_unix_ms: 1_700_000_000_000, + }); + + // Act + let s = serde_json::to_string(&evt).unwrap(); + let back: OperatorEvent = serde_json::from_str(&s).unwrap(); + + // Assert + assert!(matches!(back, OperatorEvent::PoiSurfaced(_))); + assert!(s.contains("\"kind\":\"poi_surfaced\"")); + assert!(s.contains("\"vlm_status\":\"disabled\"")); + } + + #[test] + fn operator_event_serde_roundtrip_dequeued() { + // Arrange + let evt = OperatorEvent::PoiDequeued(PoiDequeued { + poi_id: Uuid::nil(), + reason: DequeueReason::Aged, + dequeued_at: Utc::now(), + }); + + // Act + let s = serde_json::to_string(&evt).unwrap(); + let back: OperatorEvent = serde_json::from_str(&s).unwrap(); + + // Assert + assert!(matches!(back, OperatorEvent::PoiDequeued(_))); + assert!(s.contains("\"kind\":\"poi_dequeued\"")); + assert!(s.contains("\"reason\":\"aged\"")); + } +} diff --git a/crates/telemetry_stream/Cargo.toml b/crates/telemetry_stream/Cargo.toml index 6f50c37..abca723 100644 --- a/crates/telemetry_stream/Cargo.toml +++ b/crates/telemetry_stream/Cargo.toml @@ -10,6 +10,7 @@ build = "build.rs" [dependencies] shared = { workspace = true } +bytes = { workspace = true } tokio = { workspace = true } tokio-stream = { workspace = true } tracing = { workspace = true } diff --git a/crates/telemetry_stream/proto/telemetry.proto b/crates/telemetry_stream/proto/telemetry.proto index c50584d..ad161a1 100644 --- a/crates/telemetry_stream/proto/telemetry.proto +++ b/crates/telemetry_stream/proto/telemetry.proto @@ -1,17 +1,20 @@ // AZ-675 telemetry_stream — operator-bound gRPC contract. // -// One service, one bi-directional Subscribe RPC. Client opens a stream -// declaring which topics it wants; server pushes messages for those -// topics until the client disconnects. +// One Subscribe RPC multiplexes structured topics (telemetry, gimbal, +// detection, movement, MapObjects). Video is carried by a dedicated +// SubscribeVideo RPC because frame payloads are binary, large, and +// don't share the JSON-broadcast model the structured topics use. // -// The server enforces per-client back-pressure: when a client cannot -// keep up the oldest message in *that client's* queue is dropped and -// a per-(client, topic) drop counter is incremented. Other clients -// are unaffected. +// The Subscribe server enforces per-client drop-oldest back-pressure +// for the structured topics; SubscribeVideo applies the same back- +// pressure to the bytes_inline frame queue when the operator client +// cannot keep up. // -// AZ-676 will add the video path (separate RPC, server-streamed binary -// frames). AZ-677 will add the MapObjectsBundle snapshot RPC. Keep -// those concerns out of this contract. +// MapObjectsBundle (topic on Subscribe) is special: on subscribe the +// server first emits a Snapshot variant of MapObjectsBundleMessage +// and then forwards Diff variants for in-flight changes. Reconnect +// is treated as a new subscribe — a fresh Snapshot is emitted and +// diffs accumulated during the disconnect are NOT replayed. syntax = "proto3"; @@ -26,6 +29,9 @@ enum Topic { TOPIC_DETECTION_EVENT = 3; TOPIC_MOVEMENT_CANDIDATE = 4; TOPIC_MAP_OBJECTS_BUNDLE = 5; + // AZ-679 — operator-bound POI events (surfaced + dequeued). JSON + // payload is a tagged enum (`kind: poi_surfaced | poi_dequeued`). + TOPIC_OPERATOR_EVENT = 6; } message SubscribeRequest { @@ -55,10 +61,74 @@ message TelemetryMessage { bytes payload_json = 4; } +// Pixel format enum mirroring `shared::models::frame::PixelFormat`. +// Only used by VideoFrame (bytes_inline mode). +enum PixelFormat { + PIXEL_FORMAT_UNSPECIFIED = 0; + PIXEL_FORMAT_NV12 = 1; + PIXEL_FORMAT_YUV420P = 2; + PIXEL_FORMAT_RGB24 = 3; +} + +// Operator-bound video delivery mode. Per AZ-676 the autopilot is +// configured at startup to either forward the RTSP URL straight to +// the operator (lower onboard cost; default) or carry encoded bytes +// over this gRPC stream. +enum VideoMode { + VIDEO_MODE_UNSPECIFIED = 0; + VIDEO_MODE_RTSP_FORWARD = 1; + VIDEO_MODE_BYTES_INLINE = 2; +} + +message SubscribeVideoRequest { + // Operator/client identifier — plumbed into the ai_locked session + // counter, drop counters, and log lines. + string client_id = 1; +} + +// First message every SubscribeVideo stream emits. Tells the operator +// which mode the autopilot is configured in and, for rtsp_forward, +// the URL the operator should pull from. +message VideoSessionStart { + VideoMode mode = 1; + // Populated iff `mode == VIDEO_MODE_RTSP_FORWARD`. + string rtsp_url = 2; +} + +// Encoded video frame (one decoded image from frame_ingest). Emitted +// only when `mode == VIDEO_MODE_BYTES_INLINE`. +message VideoFrame { + uint64 seq = 1; + uint64 monotonic_ts_ns = 2; + uint32 width = 3; + uint32 height = 4; + PixelFormat pix_fmt = 5; + bytes pixels = 6; +} + +// Server-streamed messages on SubscribeVideo. Exactly one start +// message is always sent first, followed by zero or more frames +// (bytes_inline mode only). +message VideoMessage { + oneof kind { + VideoSessionStart start = 1; + VideoFrame frame = 2; + } +} + service TelemetryStream { // Server-streaming subscribe. The client sends ONE SubscribeRequest; // the server pushes TelemetryMessage values until the client cancels // the stream or the server shuts down. The server applies per- // client drop-oldest back-pressure if the client cannot keep up. rpc Subscribe(SubscribeRequest) returns (stream TelemetryMessage); + + // AZ-676 operator video path. The first message on every stream is + // a VideoSessionStart describing the configured delivery mode; in + // rtsp_forward mode no further messages are sent until disconnect. + // In bytes_inline mode the server forwards frames published by + // frame_ingest with the same per-client drop-oldest back-pressure + // as Subscribe (a slow operator loses frames on its own stream + // without affecting other clients or the AI pipeline). + rpc SubscribeVideo(SubscribeVideoRequest) returns (stream VideoMessage); } diff --git a/crates/telemetry_stream/src/internal/mapobjects.rs b/crates/telemetry_stream/src/internal/mapobjects.rs new file mode 100644 index 0000000..621b94b --- /dev/null +++ b/crates/telemetry_stream/src/internal/mapobjects.rs @@ -0,0 +1,178 @@ +//! AZ-677 — MapObjectsBundle snapshot + in-flight diff stream. +//! +//! Pattern: every operator client that subscribes to +//! `Topic::MapObjectsBundle` first receives one +//! [`MapObjectsTopicMessage::Snapshot`] built from the configured +//! [`MapObjectsSnapshotSource`], and then receives +//! [`MapObjectsTopicMessage::Diff`] messages for every append the +//! composition root publishes via +//! [`crate::TelemetryStreamHandle::push_mapobjects_diff`]. On +//! reconnect, the client is treated as a fresh subscriber: it gets a +//! brand new snapshot — diffs that were broadcast during the gap are +//! NOT replayed (per AZ-677 spec — best-effort replay creates +//! consistency hazards). +//! +//! The snapshot source lives outside `telemetry_stream` (composition +//! root supplies an `Arc` that adapts +//! `mapobjects_store::MapObjectsStore::snapshot()`). The diff +//! publishing side is fed by the same composition root, which +//! subscribes to the store's append log and forwards each entry as +//! `push_mapobjects_diff(diff)`. + +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +use shared::models::mapobject::{IgnoredItem, MapObject, MapObjectObservation, MapObjectsBundle}; + +/// Wire shape of a diff message. Mirrors `data_model.md §MapObjectsDiff` +/// (added observations, moved observations, removed candidates, newly +/// ignored items). Empty vectors are valid — the publisher may emit a +/// diff with only one populated bucket. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct MapObjectsDiff { + #[serde(default)] + pub added: Vec, + #[serde(default)] + pub moved: Vec, + #[serde(default)] + pub removed_candidates: Vec, + #[serde(default)] + pub ignored: Vec, +} + +/// Wire shape of the initial snapshot. Re-exposes the canonical +/// `MapObjectsBundle` payload — no transformation, just a tag so the +/// operator can tell snapshot from diff on the same topic. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MapObjectsBundleSnapshot { + pub bundle: MapObjectsBundle, +} + +/// Tagged enum carried as the JSON payload on every +/// `Topic::MapObjectsBundle` message. The discriminator is +/// `"kind": "snapshot" | "diff"` so the operator deserialises with a +/// `serde(tag = "kind")` adjacent-tagging. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum MapObjectsTopicMessage { + Snapshot(MapObjectsBundleSnapshot), + Diff(MapObjectsDiff), +} + +/// Provided by the composition root, implemented in +/// `mapobjects_store` (via a thin adapter). `telemetry_stream` queries +/// this on every fresh MapObjectsBundle subscribe. +/// +/// Implementations MUST be cheap to call concurrently (read-only). +pub trait MapObjectsSnapshotSource: Send + Sync + 'static { + fn snapshot(&self) -> MapObjectsBundle; +} + +/// Fixture impl for tests + the default "no store wired yet" mode. +/// Returns an empty bundle keyed to the supplied `mission_id`. +/// +/// Production code MUST replace this with a real adapter; the empty +/// bundle is acceptable only for unit tests and for the case where +/// the composition root has not finished wiring (a green-field +/// startup race). +pub struct EmptyMapObjectsSource { + pub mission_id: String, +} + +impl MapObjectsSnapshotSource for EmptyMapObjectsSource { + fn snapshot(&self) -> MapObjectsBundle { + use chrono::Utc; + use shared::models::mission::Coordinate; + let zero = Coordinate { + latitude: 0.0, + longitude: 0.0, + altitude_m: 0.0, + }; + MapObjectsBundle { + schema_version: "1.0".to_string(), + mission_id: self.mission_id.clone(), + bbox: [zero, zero], + map_objects: Vec::new(), + observations: Vec::new(), + ignored_items: Vec::new(), + as_of: Utc::now(), + freshness: None, + } + } +} + +/// Type-erased snapshot source — what `TelemetryStream` holds. +pub type SharedSnapshotSource = Arc; + +#[cfg(test)] +mod tests { + use super::*; + use shared::models::mission::Coordinate; + + #[test] + fn topic_message_serde_roundtrip_snapshot() { + // Arrange + let bundle = MapObjectsBundle { + schema_version: "1.0".to_string(), + mission_id: "m1".to_string(), + bbox: [ + Coordinate { + latitude: 0.0, + longitude: 0.0, + altitude_m: 0.0, + }, + Coordinate { + latitude: 1.0, + longitude: 1.0, + altitude_m: 0.0, + }, + ], + map_objects: vec![], + observations: vec![], + ignored_items: vec![], + as_of: chrono::Utc::now(), + freshness: None, + }; + let msg = MapObjectsTopicMessage::Snapshot(MapObjectsBundleSnapshot { bundle }); + + // Act + let s = serde_json::to_string(&msg).unwrap(); + let back: MapObjectsTopicMessage = serde_json::from_str(&s).unwrap(); + + // Assert + assert!(matches!(back, MapObjectsTopicMessage::Snapshot(_))); + assert!(s.contains("\"kind\":\"snapshot\"")); + } + + #[test] + fn topic_message_serde_roundtrip_diff() { + // Arrange + let msg = MapObjectsTopicMessage::Diff(MapObjectsDiff::default()); + + // Act + let s = serde_json::to_string(&msg).unwrap(); + let back: MapObjectsTopicMessage = serde_json::from_str(&s).unwrap(); + + // Assert + assert!(matches!(back, MapObjectsTopicMessage::Diff(_))); + assert!(s.contains("\"kind\":\"diff\"")); + } + + #[test] + fn empty_source_returns_empty_bundle_with_mission_id() { + // Arrange + let src = EmptyMapObjectsSource { + mission_id: "m42".to_string(), + }; + + // Act + let b = src.snapshot(); + + // Assert + assert_eq!(b.mission_id, "m42"); + assert!(b.map_objects.is_empty()); + assert!(b.observations.is_empty()); + assert!(b.ignored_items.is_empty()); + } +} diff --git a/crates/telemetry_stream/src/internal/mod.rs b/crates/telemetry_stream/src/internal/mod.rs index 9d2c167..86fe410 100644 --- a/crates/telemetry_stream/src/internal/mod.rs +++ b/crates/telemetry_stream/src/internal/mod.rs @@ -1,5 +1,8 @@ //! Internal modules for `telemetry_stream`. Not part of the public API. +pub mod mapobjects; pub mod proto; pub mod publisher; pub mod server; +pub mod video; +pub mod video_server; diff --git a/crates/telemetry_stream/src/internal/publisher.rs b/crates/telemetry_stream/src/internal/publisher.rs index 8bdfcbb..367ba02 100644 --- a/crates/telemetry_stream/src/internal/publisher.rs +++ b/crates/telemetry_stream/src/internal/publisher.rs @@ -17,6 +17,9 @@ use serde::Serialize; use tokio::sync::broadcast; use tracing::warn; +use crate::internal::mapobjects::{ + MapObjectsBundleSnapshot, MapObjectsDiff, MapObjectsTopicMessage, SharedSnapshotSource, +}; use crate::internal::proto::{TelemetryMessage, Topic}; /// Per-topic broadcast capacity. A client falling more than this many @@ -34,6 +37,7 @@ pub const ALL_TOPICS: &[Topic] = &[ Topic::DetectionEvent, Topic::MovementCandidate, Topic::MapObjectsBundle, + Topic::OperatorEvent, ]; /// Errors returned by [`TelemetryPublisher::publish`]. Publish never @@ -96,6 +100,20 @@ pub struct TelemetryPublisher { topics: HashMap, drops: DropMap, subscribed_clients: AtomicUsize, + /// AZ-677 — composition-root-supplied snapshot source. Read on + /// every fresh MapObjectsBundle subscribe. + snapshot_source: Mutex>, + /// AZ-677 — `mapobjects_resnap_count` counter. Incremented every + /// time the subscribe handler emits a snapshot (new client OR + /// reconnecting client). + mapobjects_resnap_count: AtomicU64, + /// AZ-677 — `mapobjects_diff_count` counter. Incremented every + /// time `publish_mapobjects_diff` is called. + mapobjects_diff_count: AtomicU64, + /// AZ-677 — cumulative bytes of the most recently serialised + /// snapshot. Updated by `current_snapshot_message()` so the + /// health surface can report bundle weight without re-serialising. + last_snapshot_bytes: AtomicU64, } impl TelemetryPublisher { @@ -111,9 +129,74 @@ impl TelemetryPublisher { topics, drops: Mutex::new(HashMap::new()), subscribed_clients: AtomicUsize::new(0), + snapshot_source: Mutex::new(None), + mapobjects_resnap_count: AtomicU64::new(0), + mapobjects_diff_count: AtomicU64::new(0), + last_snapshot_bytes: AtomicU64::new(0), }) } + /// Composition-root entry point. Wires the + /// `MapObjectsSnapshotSource` (typically an adapter over + /// `mapobjects_store::MapObjectsStore`). Replacing an existing + /// source is allowed (test fixtures use this). + pub fn set_snapshot_source(&self, src: SharedSnapshotSource) { + *self.snapshot_source.lock() = Some(src); + } + + /// AZ-677 — build the snapshot message the subscribe handler must + /// emit before forwarding any diff. Returns `None` when no + /// snapshot source has been wired yet; the subscribe handler then + /// proceeds straight to the diff broadcast (an empty store is the + /// natural cold-start state). + pub(crate) fn current_snapshot_message(&self) -> Option { + let snap_src = self.snapshot_source.lock().as_ref().map(Arc::clone)?; + let bundle = snap_src.snapshot(); + let payload = MapObjectsTopicMessage::Snapshot(MapObjectsBundleSnapshot { bundle }); + let bytes = match serde_json::to_vec(&payload) { + Ok(b) => b, + Err(e) => { + warn!(error = %e, "mapobjects snapshot serialise failed; skipping"); + return None; + } + }; + self.last_snapshot_bytes + .store(bytes.len() as u64, Ordering::Relaxed); + self.mapobjects_resnap_count.fetch_add(1, Ordering::Relaxed); + let topic = Topic::MapObjectsBundle; + let channel = self.topics.get(&topic)?; + let seq = channel.seq.fetch_add(1, Ordering::Relaxed) + 1; + Some(TelemetryMessage { + topic: topic as i32, + monotonic_ts_ns: shared::clock::MonoClock::new().elapsed_ns(), + sequence: seq, + payload_json: bytes, + }) + } + + /// AZ-677 — broadcast a MapObjectsDiff to every active operator + /// subscriber that has the MapObjectsBundle topic in their + /// subscription set. The composition root calls this whenever + /// `mapobjects_store` appends an observation / ignored item. + /// + /// Diffs flow through the existing `Topic::MapObjectsBundle` + /// broadcast channel — discriminated from snapshots by the + /// `"kind": "diff"` tag on the JSON payload. + pub fn publish_mapobjects_diff(&self, diff: MapObjectsDiff) -> Result<(), PublishError> { + let payload = MapObjectsTopicMessage::Diff(diff); + self.publish(Topic::MapObjectsBundle, &payload)?; + self.mapobjects_diff_count.fetch_add(1, Ordering::Relaxed); + Ok(()) + } + + pub fn mapobjects_counters(&self) -> (u64, u64, u64) { + ( + self.mapobjects_resnap_count.load(Ordering::Relaxed), + self.mapobjects_diff_count.load(Ordering::Relaxed), + self.last_snapshot_bytes.load(Ordering::Relaxed), + ) + } + pub fn default_capacity() -> Arc { Self::new(DEFAULT_TOPIC_CAPACITY) } diff --git a/crates/telemetry_stream/src/internal/server.rs b/crates/telemetry_stream/src/internal/server.rs index 49488c5..3189b83 100644 --- a/crates/telemetry_stream/src/internal/server.rs +++ b/crates/telemetry_stream/src/internal/server.rs @@ -23,16 +23,22 @@ use tonic::{Request, Response, Status}; use tracing::{info, warn}; use crate::internal::proto::telemetry_stream_server::TelemetryStream; -use crate::internal::proto::{SubscribeRequest, TelemetryMessage, Topic}; +use crate::internal::proto::{SubscribeRequest, SubscribeVideoRequest, TelemetryMessage, Topic}; use crate::internal::publisher::{TelemetryPublisher, ALL_TOPICS}; +use crate::internal::video::VideoPublisher; +use crate::internal::video_server::{VideoService, VideoStream}; pub struct TelemetryService { publisher: Arc, + video: Arc, } impl TelemetryService { - pub fn new(publisher: Arc) -> Self { - Self { publisher } + pub fn new(publisher: Arc, video_publisher: Arc) -> Self { + Self { + publisher, + video: Arc::new(VideoService::new(video_publisher)), + } } } @@ -41,6 +47,7 @@ type SubscribeStream = Pin Some(Ok(msg)), Err(BroadcastStreamRecvError::Lagged(n)) => { warn!(client_id = %cid, ?topic, dropped = n, "slow client lagged"); @@ -95,6 +117,11 @@ impl TelemetryStream for TelemetryService { } }); + let stream = StartThen { + start: mapobjects_snapshot.map(Ok), + body, + }; + let stream = StreamGuard { inner: stream, publisher: Arc::clone(&self.publisher), @@ -102,6 +129,35 @@ impl TelemetryStream for TelemetryService { Ok(Response::new(Box::pin(stream) as Self::SubscribeStream)) } + + async fn subscribe_video( + &self, + request: Request, + ) -> Result, Status> { + self.video.handle_subscribe(request).await + } +} + +/// AZ-677 — emit `start` once (the MapObjects snapshot), then yield +/// everything from `body`. When `start` is `None` the stream +/// degenerates to `body` with zero overhead. +struct StartThen { + start: Option>, + body: S, +} + +impl Stream for StartThen +where + S: Stream> + Send + Unpin, +{ + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if let Some(msg) = self.start.take() { + return Poll::Ready(Some(msg)); + } + Pin::new(&mut self.body).poll_next(cx) + } } /// Decrement `subscribed_clients` when the per-client outbound diff --git a/crates/telemetry_stream/src/internal/video.rs b/crates/telemetry_stream/src/internal/video.rs new file mode 100644 index 0000000..2168e37 --- /dev/null +++ b/crates/telemetry_stream/src/internal/video.rs @@ -0,0 +1,365 @@ +//! AZ-676 — operator video path. +//! +//! Two delivery modes selected at startup via [`VideoPath`]: +//! - `RtspForward { url }`: the autopilot tells the operator which +//! RTSP URL the camera is publishing on; bytes never traverse this +//! gRPC stream. This is the recommended default (lower onboard +//! cost, no per-frame copy). +//! - `BytesInline`: the operator pulls encoded frames over the +//! `SubscribeVideo` stream. `frame_ingest` publishes each decoded +//! frame here via [`VideoPublisher::publish_frame`]; the per-client +//! stream applies drop-oldest back-pressure identical to the +//! structured `Subscribe` path so a slow operator never blocks +//! `frame_ingest`. +//! +//! ## ai_locked coordination +//! +//! [`VideoPublisher`] owns an `Arc` exposed via +//! [`VideoPublisher::ai_locked_handle`]. The atomic is shared with +//! `frame_ingest` and `detection_client` (composition root wires it +//! into their constructors). The atomic flips: +//! - `false → true` when the first operator subscribes to +//! `SubscribeVideo` (first session join). +//! - `true → false` when the last operator disconnects (last session +//! leave). +//! +//! In `RtspForward` mode the same toggle applies — even though we +//! emit only the URL, the operator is consuming the video path and +//! AI must back off the frame budget. + +use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; + +use parking_lot::Mutex; +use tokio::sync::broadcast; +use tracing::warn; + +use shared::models::frame::{Frame, PixelFormat}; + +/// Server-side per-client outbound broadcast capacity for the +/// bytes_inline frame channel. Frames are large (full-resolution +/// pixel buffers) so the budget is smaller than the structured-topic +/// publisher: ≥1 second of headroom at 30 fps is enough for transient +/// modem stalls without ballooning memory. +pub const DEFAULT_VIDEO_CAPACITY: usize = 32; + +/// Selected at startup. The autopilot's `config.video_path` resolves +/// to one of these. +#[derive(Debug, Clone)] +pub enum VideoPath { + /// Emit the configured RTSP URL on session-start; no bytes flow + /// through this gRPC stream. Operator stacks pull RTSP directly + /// from the camera (most common). + RtspForward { url: String }, + /// Carry encoded bytes over the gRPC stream. Used when the + /// operator cannot reach the camera's RTSP source directly. + BytesInline, +} + +impl Default for VideoPath { + fn default() -> Self { + // The architecture default is rtsp_forward with an empty URL + // placeholder; the composition root must set the real URL + // before binding the server. We choose a sentinel URL so a + // misconfigured deployment surfaces in the operator session- + // start message rather than silently mis-pointing. + Self::RtspForward { + url: "rtsp://unconfigured.invalid/stream".to_string(), + } + } +} + +impl VideoPath { + pub fn mode_label(&self) -> &'static str { + match self { + Self::RtspForward { .. } => "rtsp_forward", + Self::BytesInline => "bytes_inline", + } + } +} + +/// Wire-shaped video frame. We carry exactly what +/// `shared::models::frame::Frame` carries, minus the `ai_locked` +/// flag (it's a control signal, not a per-frame property the +/// operator needs). +/// +/// Pixels are cloned (`Arc` shallow clone — O(1)) into the +/// broadcast channel; downstream the gRPC encode path turns them +/// into the proto `VideoFrame` message. +#[derive(Debug, Clone)] +pub struct VideoFrameMessage { + pub seq: u64, + pub monotonic_ts_ns: u64, + pub width: u32, + pub height: u32, + pub pix_fmt: PixelFormat, + pub pixels: bytes::Bytes, +} + +impl From<&Frame> for VideoFrameMessage { + fn from(f: &Frame) -> Self { + Self { + seq: f.seq, + monotonic_ts_ns: f.decode_ts_monotonic_ns, + width: f.width, + height: f.height, + pix_fmt: f.pix_fmt, + pixels: (*f.pixels).clone(), + } + } +} + +/// Snapshot of video-path health for the +/// [`crate::TelemetryStreamHandle::health`] surface. +#[derive(Debug, Clone)] +pub struct VideoSnapshot { + pub mode: &'static str, + pub ai_locked: bool, + pub video_session_count: usize, + pub published_frames: u64, + pub bytes_inline_drops_total: u64, +} + +pub struct VideoPublisher { + path: VideoPath, + /// Per-client broadcast for bytes_inline mode. Allocated even in + /// rtsp_forward mode so [`publish_frame`] is a cheap no-op (no + /// branch on the hot path beyond the mode check). Subscriber + /// count drives the per-client send. + tx: broadcast::Sender, + ai_locked: Arc, + /// Live operator subscribers to `SubscribeVideo`. The atomic flip + /// is keyed off the transition through zero in either direction. + video_session_count: Arc, + /// Aggregate per-client drops on the video broadcast. Equivalent + /// to `bytes_inline_drops_total` in the AZ-676 health surface. + bytes_inline_drops: Arc, + /// `publish_frame` call count (incremented in both modes; in + /// rtsp_forward it stays 0 because the function returns early). + published_frames: AtomicU64, + drops_per_client: Mutex>, +} + +impl VideoPublisher { + pub fn new(path: VideoPath, capacity: usize) -> Arc { + let (tx, _) = broadcast::channel(capacity); + Arc::new(Self { + path, + tx, + ai_locked: Arc::new(AtomicBool::new(false)), + video_session_count: Arc::new(AtomicUsize::new(0)), + bytes_inline_drops: Arc::new(AtomicU64::new(0)), + published_frames: AtomicU64::new(0), + drops_per_client: Mutex::new(std::collections::HashMap::new()), + }) + } + + pub fn default_capacity(path: VideoPath) -> Arc { + Self::new(path, DEFAULT_VIDEO_CAPACITY) + } + + /// Shared `Arc` siblings (`frame_ingest`, + /// `detection_client`) read at decode/inference time. The atomic + /// is owned by `telemetry_stream`; siblings only read. + pub fn ai_locked_handle(&self) -> Arc { + Arc::clone(&self.ai_locked) + } + + pub fn mode(&self) -> &VideoPath { + &self.path + } + + /// Publish one decoded frame. In rtsp_forward mode this is a + /// no-op (the operator never pulls bytes through this server); + /// the call exists so `frame_ingest` can always invoke + /// `TelemetrySink::push_frame` regardless of configuration. + pub fn publish_frame(&self, frame: &Frame) { + if matches!(self.path, VideoPath::RtspForward { .. }) { + return; + } + let msg = VideoFrameMessage::from(frame); + // `broadcast::send` returns the number of receivers it + // queued for; Err means no receivers, which is fine and + // expected (no operator subscribed). + let _ = self.tx.send(msg); + self.published_frames.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn subscribe_video(&self) -> broadcast::Receiver { + self.tx.subscribe() + } + + /// Called by the gRPC `SubscribeVideo` handler when a new client + /// joins. Returns the post-join session count. The first joiner + /// (transition 0 → 1) flips `ai_locked` to `true`. + pub(crate) fn register_session(&self) -> usize { + let prev = self.video_session_count.fetch_add(1, Ordering::AcqRel); + if prev == 0 { + self.ai_locked.store(true, Ordering::Release); + } + prev + 1 + } + + /// Called by the gRPC handler (via `Drop` on the per-client + /// guard) when a client disconnects. The last leaver (transition + /// 1 → 0) flips `ai_locked` back to `false`. + pub(crate) fn deregister_session(&self) -> usize { + let prev = self.video_session_count.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + self.ai_locked.store(false, Ordering::Release); + } else if prev == 0 { + // Defensive: should never underflow because every + // deregister is paired with a register. Log loudly so we + // catch wiring mistakes early. + warn!("video_session_count underflow — register/deregister mismatch"); + self.video_session_count.store(0, Ordering::Release); + } + prev.saturating_sub(1) + } + + pub fn record_drops(&self, client_id: &str, n: u64) { + if n == 0 { + return; + } + self.bytes_inline_drops.fetch_add(n, Ordering::Relaxed); + let mut map = self.drops_per_client.lock(); + map.entry(client_id.to_string()) + .or_insert_with(|| AtomicU64::new(0)) + .fetch_add(n, Ordering::Relaxed); + } + + pub fn snapshot(&self) -> VideoSnapshot { + VideoSnapshot { + mode: self.path.mode_label(), + ai_locked: self.ai_locked.load(Ordering::Acquire), + video_session_count: self.video_session_count.load(Ordering::Acquire), + published_frames: self.published_frames.load(Ordering::Relaxed), + bytes_inline_drops_total: self.bytes_inline_drops.load(Ordering::Relaxed), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::Ordering; + + fn frame(seq: u64, ai_locked: bool) -> Frame { + Frame { + seq, + capture_ts_monotonic_ns: seq, + decode_ts_monotonic_ns: seq + 1, + pixels: Arc::new(bytes::Bytes::from(vec![0u8; 16])), + width: 4, + height: 4, + pix_fmt: PixelFormat::Nv12, + ai_locked, + } + } + + #[test] + fn rtsp_forward_publish_frame_is_a_no_op() { + // Arrange + let pubv = VideoPublisher::default_capacity(VideoPath::RtspForward { + url: "rtsp://x/y".to_string(), + }); + + // Act + pubv.publish_frame(&frame(1, false)); + pubv.publish_frame(&frame(2, false)); + + // Assert + let snap = pubv.snapshot(); + assert_eq!(snap.published_frames, 0); + assert_eq!(snap.mode, "rtsp_forward"); + } + + #[test] + fn bytes_inline_publish_frame_counts_and_fans_out() { + // Arrange + let pubv = VideoPublisher::default_capacity(VideoPath::BytesInline); + let mut rx = pubv.subscribe_video(); + + // Act + pubv.publish_frame(&frame(1, false)); + pubv.publish_frame(&frame(2, false)); + + // Assert + let snap = pubv.snapshot(); + assert_eq!(snap.published_frames, 2); + assert_eq!(snap.mode, "bytes_inline"); + assert_eq!(rx.try_recv().unwrap().seq, 1); + assert_eq!(rx.try_recv().unwrap().seq, 2); + } + + #[test] + fn register_first_session_flips_ai_locked_true() { + // Arrange + let pubv = VideoPublisher::default_capacity(VideoPath::BytesInline); + let flag = pubv.ai_locked_handle(); + assert!(!flag.load(Ordering::Acquire)); + + // Act + let n = pubv.register_session(); + + // Assert + assert_eq!(n, 1); + assert!(flag.load(Ordering::Acquire)); + assert_eq!(pubv.snapshot().video_session_count, 1); + } + + #[test] + fn deregister_last_session_flips_ai_locked_false() { + // Arrange + let pubv = VideoPublisher::default_capacity(VideoPath::BytesInline); + let flag = pubv.ai_locked_handle(); + pubv.register_session(); + pubv.register_session(); + assert!(flag.load(Ordering::Acquire)); + assert_eq!(pubv.snapshot().video_session_count, 2); + + // Act 1 — one session leaves; flag must still be true. + let after_first_leave = pubv.deregister_session(); + assert_eq!(after_first_leave, 1); + assert!( + flag.load(Ordering::Acquire), + "one session left → still locked" + ); + + // Act 2 — last session leaves; flag must flip to false. + let after_second_leave = pubv.deregister_session(); + assert_eq!(after_second_leave, 0); + assert!( + !flag.load(Ordering::Acquire), + "last session left → unlocked" + ); + } + + #[test] + fn record_drops_aggregates_and_per_client() { + // Arrange + let pubv = VideoPublisher::default_capacity(VideoPath::BytesInline); + + // Act + pubv.record_drops("op_a", 5); + pubv.record_drops("op_a", 2); + pubv.record_drops("op_b", 3); + + // Assert + assert_eq!(pubv.snapshot().bytes_inline_drops_total, 10); + } + + #[test] + fn mode_label_matches_task_spec_strings() { + // The AZ-676 task spec calls these out as the operator-facing + // mode strings; pin them as a regression guard. + assert_eq!(VideoPath::BytesInline.mode_label(), "bytes_inline"); + assert_eq!( + VideoPath::RtspForward { + url: "rtsp://x".into() + } + .mode_label(), + "rtsp_forward" + ); + } +} diff --git a/crates/telemetry_stream/src/internal/video_server.rs b/crates/telemetry_stream/src/internal/video_server.rs new file mode 100644 index 0000000..73c6c44 --- /dev/null +++ b/crates/telemetry_stream/src/internal/video_server.rs @@ -0,0 +1,167 @@ +//! AZ-676 — `SubscribeVideo` RPC handler. +//! +//! Each accepted stream: +//! 1. Registers the session (increments `video_session_count`; flips +//! `ai_locked` to `true` on the 0 → 1 transition). +//! 2. Emits exactly one `VideoSessionStart` describing the configured +//! delivery mode (`rtsp_forward { rtsp_url }` or `bytes_inline`). +//! 3. In `bytes_inline` mode, forwards `VideoFrameMessage`s from the +//! publisher's broadcast channel as `VideoFrame` proto messages. +//! Lagged broadcast → drop accounting (per AZ-676 spec; bytes_inline +//! drops_total counter on the health surface). +//! 4. On stream drop, deregisters the session (decrements counter; +//! flips `ai_locked` to `false` on the 1 → 0 transition). + +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use tokio_stream::wrappers::errors::BroadcastStreamRecvError; +use tokio_stream::wrappers::BroadcastStream; +use tokio_stream::{Stream, StreamExt}; +use tonic::{Request, Response, Status}; +use tracing::{info, warn}; + +use crate::internal::proto::{ + video_message, PixelFormat as ProtoPixelFormat, SubscribeVideoRequest, VideoFrame, + VideoMessage, VideoMode, VideoSessionStart, +}; +use crate::internal::video::{VideoFrameMessage, VideoPath, VideoPublisher}; +use shared::models::frame::PixelFormat as SharedPixelFormat; + +pub type VideoStream = Pin> + Send>>; + +pub struct VideoService { + publisher: Arc, +} + +impl VideoService { + pub fn new(publisher: Arc) -> Self { + Self { publisher } + } + + pub async fn handle_subscribe( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + if req.client_id.trim().is_empty() { + return Err(Status::invalid_argument("client_id is required")); + } + let client_id = req.client_id.clone(); + + let session_n = self.publisher.register_session(); + info!(client_id = %client_id, session_n, mode = self.publisher.mode().mode_label(), "video subscribe"); + + let start_msg = match self.publisher.mode() { + VideoPath::RtspForward { url } => VideoMessage { + kind: Some(video_message::Kind::Start(VideoSessionStart { + mode: VideoMode::RtspForward as i32, + rtsp_url: url.clone(), + })), + }, + VideoPath::BytesInline => VideoMessage { + kind: Some(video_message::Kind::Start(VideoSessionStart { + mode: VideoMode::BytesInline as i32, + rtsp_url: String::new(), + })), + }, + }; + + // Build the body stream: in bytes_inline mode, forward frames + // from the broadcast. In rtsp_forward mode the body is empty + // (operator keeps the stream open just to hold the ai_locked + // session; we hand it `pending` so it sits idle until the + // client cancels). + let publisher = Arc::clone(&self.publisher); + let cid = client_id.clone(); + let body: VideoStream = match self.publisher.mode() { + VideoPath::RtspForward { .. } => Box::pin(tokio_stream::pending()), + VideoPath::BytesInline => { + let rx = self.publisher.subscribe_video(); + let mapped = BroadcastStream::new(rx).filter_map(move |item| match item { + Ok(f) => Some(Ok(VideoMessage { + kind: Some(video_message::Kind::Frame(to_proto_frame(&f))), + })), + Err(BroadcastStreamRecvError::Lagged(n)) => { + warn!(client_id = %cid, dropped = n, "video client lagged"); + publisher.record_drops(&cid, n); + None + } + }); + Box::pin(mapped) + } + }; + + let stream = StartThen { + start: Some(Ok(start_msg)), + body, + }; + + let guarded = VideoStreamGuard { + inner: stream, + publisher: Arc::clone(&self.publisher), + }; + + Ok(Response::new(Box::pin(guarded) as VideoStream)) + } +} + +fn to_proto_frame(f: &VideoFrameMessage) -> VideoFrame { + let pix = match f.pix_fmt { + SharedPixelFormat::Nv12 => ProtoPixelFormat::Nv12, + SharedPixelFormat::Yuv420p => ProtoPixelFormat::Yuv420p, + SharedPixelFormat::Rgb24 => ProtoPixelFormat::Rgb24, + }; + VideoFrame { + seq: f.seq, + monotonic_ts_ns: f.monotonic_ts_ns, + width: f.width, + height: f.height, + pix_fmt: pix as i32, + pixels: f.pixels.to_vec(), + } +} + +/// Emit `start` once, then yield everything from `body`. Cheaper than +/// `stream::once(...).chain(body)` because we avoid allocating an +/// extra adapter just for one message. +struct StartThen { + start: Option>, + body: S, +} + +impl Stream for StartThen +where + S: Stream> + Send + Unpin, +{ + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if let Some(msg) = self.start.take() { + return Poll::Ready(Some(msg)); + } + Pin::new(&mut self.body).poll_next(cx) + } +} + +/// Deregister the video session when the per-client outbound stream +/// drops. This flips `ai_locked` back to `false` on the last leaver. +struct VideoStreamGuard { + inner: S, + publisher: Arc, +} + +impl Stream for VideoStreamGuard { + type Item = S::Item; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_next(cx) + } +} + +impl Drop for VideoStreamGuard { + fn drop(&mut self) { + self.publisher.deregister_session(); + } +} diff --git a/crates/telemetry_stream/src/lib.rs b/crates/telemetry_stream/src/lib.rs index d4809f6..8cb0085 100644 --- a/crates/telemetry_stream/src/lib.rs +++ b/crates/telemetry_stream/src/lib.rs @@ -1,18 +1,22 @@ //! `telemetry_stream` — always-on uplink to the Ground Station + operator-command downlink. //! //! Real implementations: -//! - **AZ-675 (this crate, this batch)**: Tonic gRPC server, per-client -//! bounded queue, drop-oldest back-pressure, drop counters. Topics: +//! - **AZ-675**: Tonic gRPC server, per-client bounded queue, +//! drop-oldest back-pressure, drop counters. Topics: //! `TelemetrySample`, `GimbalState`, `DetectionEvent`, //! `MovementCandidate`, `MapObjectsBundle`. -//! - **AZ-676**: video frame topic (separate RPC, server-streamed -//! binary payloads). -//! - **AZ-677**: diff-based snapshot emission for `MapObjectsBundle`. +//! - **AZ-676** (this crate, this batch): operator video path — two +//! modes (`RtspForward { url }`, `BytesInline`) plus shared +//! `ai_locked` atomic flipped by SubscribeVideo session counter. +//! - **AZ-677** (this crate, this batch): MapObjectsBundle snapshot +//! on subscribe + diff stream while connected + fresh snapshot on +//! reconnect (no diff replay). //! - **AZ-678+**: command-auth on the return path (operator_bridge). pub mod internal; use std::net::SocketAddr; +use std::sync::atomic::AtomicBool; use std::sync::Arc; use async_trait::async_trait; @@ -26,19 +30,28 @@ use shared::health::{ComponentHealth, HealthLevel}; use shared::models::detection::DetectionBatch; use shared::models::frame::Frame; use shared::models::operator::OperatorCommand; +use shared::models::operator_event::OperatorEvent; +use crate::internal::mapobjects::{MapObjectsDiff, SharedSnapshotSource}; use crate::internal::proto::telemetry_stream_server::TelemetryStreamServer; use crate::internal::proto::Topic; use crate::internal::publisher::{TelemetryPublisher, DEFAULT_TOPIC_CAPACITY}; use crate::internal::server::TelemetryService; +use crate::internal::video::{VideoPath, VideoPublisher, DEFAULT_VIDEO_CAPACITY}; +pub use crate::internal::mapobjects::{ + EmptyMapObjectsSource, MapObjectsBundleSnapshot, MapObjectsSnapshotSource, + MapObjectsTopicMessage, +}; pub use crate::internal::proto::{ - telemetry_stream_client::TelemetryStreamClient, SubscribeRequest, TelemetryMessage, - Topic as TelemetryTopic, + telemetry_stream_client::TelemetryStreamClient, video_message, SubscribeRequest, + SubscribeVideoRequest, TelemetryMessage, Topic as TelemetryTopic, VideoFrame, VideoMessage, + VideoMode, VideoSessionStart, }; pub use crate::internal::publisher::{ PerTopicCounters, PublishError, PublisherSnapshot, ALL_TOPICS, }; +pub use crate::internal::video::{VideoSnapshot, DEFAULT_VIDEO_CAPACITY as VIDEO_DEFAULT_CAPACITY}; const NAME: &str = "telemetry_stream"; @@ -56,6 +69,10 @@ pub struct TelemetryStreamConfig { /// Bounded capacity of the downlink command channel that feeds /// `operator_bridge`. pub downlink_capacity: usize, + /// AZ-676 — video delivery mode + per-client video broadcast + /// capacity. + pub video_path: VideoPath, + pub video_capacity: usize, } impl Default for TelemetryStreamConfig { @@ -64,12 +81,15 @@ impl Default for TelemetryStreamConfig { listen_addr: "0.0.0.0:50061".parse().expect("hardcoded addr parses"), topic_capacity: DEFAULT_TOPIC_CAPACITY, downlink_capacity: 64, + video_path: VideoPath::default(), + video_capacity: DEFAULT_VIDEO_CAPACITY, } } } pub struct TelemetryStream { publisher: Arc, + video: Arc, commands_tx: mpsc::Sender, commands_rx: Option>, config: TelemetryStreamConfig, @@ -85,9 +105,11 @@ impl TelemetryStream { pub fn with_config(config: TelemetryStreamConfig) -> Self { let publisher = TelemetryPublisher::new(config.topic_capacity); + let video = VideoPublisher::new(config.video_path.clone(), config.video_capacity); let (commands_tx, commands_rx) = mpsc::channel(config.downlink_capacity); Self { publisher, + video, commands_tx, commands_rx: Some(commands_rx), config, @@ -97,10 +119,25 @@ impl TelemetryStream { pub fn handle(&self) -> TelemetryStreamHandle { TelemetryStreamHandle { publisher: Arc::clone(&self.publisher), + video: Arc::clone(&self.video), commands_tx: self.commands_tx.clone(), } } + /// AZ-676 — handle on the shared `ai_locked` atomic. + /// `frame_ingest` and `detection_client` read this at decode and + /// inference time. The composition root must call this and feed + /// the result into their constructors. + pub fn ai_locked_handle(&self) -> Arc { + self.video.ai_locked_handle() + } + + /// AZ-677 — wire the snapshot source. The composition root passes + /// an adapter over `mapobjects_store::MapObjectsStore::snapshot()`. + pub fn set_mapobjects_snapshot_source(&self, src: SharedSnapshotSource) { + self.publisher.set_snapshot_source(src); + } + /// Take the downlink command receiver. The composition root /// forwards it to `operator_bridge` as `Receiver`. pub fn take_command_receiver(&mut self) -> Option> { @@ -118,9 +155,10 @@ impl TelemetryStream { )> { let listen_addr = self.config.listen_addr; let publisher = Arc::clone(&self.publisher); + let video = Arc::clone(&self.video); let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); - let svc = TelemetryStreamServer::new(TelemetryService::new(publisher)); + let svc = TelemetryStreamServer::new(TelemetryService::new(publisher, video)); let join = tokio::spawn(async move { Server::builder() .add_service(svc) @@ -156,8 +194,9 @@ impl TelemetryStream { let stream = tokio_stream::wrappers::TcpListenerStream::new(tokio_listener); let publisher = Arc::clone(&self.publisher); + let video = Arc::clone(&self.video); let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); - let svc = TelemetryStreamServer::new(TelemetryService::new(publisher)); + let svc = TelemetryStreamServer::new(TelemetryService::new(publisher, video)); let join = tokio::spawn(async move { Server::builder() @@ -202,6 +241,7 @@ impl Drop for GrpcShutdown { #[derive(Clone)] pub struct TelemetryStreamHandle { publisher: Arc, + video: Arc, commands_tx: mpsc::Sender, } @@ -216,6 +256,16 @@ impl TelemetryStreamHandle { self.publisher.publish(topic, payload) } + /// AZ-677 — broadcast a MapObjectsDiff to operators subscribed to + /// the MapObjectsBundle topic. Fed by the composition root that + /// owns the `mapobjects_store` append stream. + pub fn push_mapobjects_diff( + &self, + diff: MapObjectsDiff, + ) -> std::result::Result<(), PublishError> { + self.publisher.publish_mapobjects_diff(diff) + } + /// Inject an operator command downlink. Production path is fed /// by the gRPC return half once AZ-678 lands; tests may call this /// directly. @@ -230,8 +280,14 @@ impl TelemetryStreamHandle { self.publisher.snapshot() } + pub fn video_snapshot(&self) -> VideoSnapshot { + self.video.snapshot() + } + pub fn health(&self) -> ComponentHealth { let snap = self.publisher.snapshot(); + let vsnap = self.video.snapshot(); + let (resnap, diff_count, snap_bytes) = self.publisher.mapobjects_counters(); let mut h = ComponentHealth::green(NAME); let hot_drops: Vec<_> = snap @@ -241,10 +297,20 @@ impl TelemetryStreamHandle { .collect(); let detail = format!( - "subscribers={} published_total={} hot_drop_pairs={}", + "subscribers={} published_total={} hot_drop_pairs={} \ + video_path={} ai_locked={} video_sessions={} \ + bytes_inline_drops={} mapobjects_snapshot_bytes={} \ + mapobjects_diff_count={} mapobjects_resnap_count={}", snap.subscribed_clients, snap.published_total, - hot_drops.len() + hot_drops.len(), + vsnap.mode, + vsnap.ai_locked, + vsnap.video_session_count, + vsnap.bytes_inline_drops_total, + snap_bytes, + diff_count, + resnap, ); if !hot_drops.is_empty() { @@ -257,10 +323,13 @@ impl TelemetryStreamHandle { #[async_trait] impl TelemetrySink for TelemetryStreamHandle { - async fn push_frame(&self, _frame: Frame) -> Result<()> { - Err(AutopilotError::NotImplemented( - "telemetry_stream::push_frame (AZ-676 video path)", - )) + async fn push_frame(&self, frame: Frame) -> Result<()> { + // AZ-676 — bytes_inline path. In rtsp_forward mode the + // publisher returns early; the call is intentionally + // infallible so frame_ingest can always push without + // branching on configuration. + self.video.publish_frame(&frame); + Ok(()) } async fn push_detections(&self, batch: DetectionBatch) -> Result<()> { @@ -268,11 +337,20 @@ impl TelemetrySink for TelemetryStreamHandle { .publish(Topic::DetectionEvent, &batch) .map_err(|e| AutopilotError::Internal(format!("publish detections: {e}"))) } + + async fn push_operator_event(&self, event: OperatorEvent) -> Result<()> { + // AZ-679 — serialised onto Topic::OperatorEvent. JSON payload + // is the tagged enum (`kind: poi_surfaced | poi_dequeued`). + self.publisher + .publish(Topic::OperatorEvent, &event) + .map_err(|e| AutopilotError::Internal(format!("publish operator event: {e}"))) + } } #[cfg(test)] mod tests { use super::*; + use std::sync::atomic::Ordering; #[test] fn handle_starts_with_zero_subscribers_and_green_health() { @@ -306,4 +384,86 @@ mod tests { // Assert assert_eq!(h.snapshot().per_topic[&Topic::TelemetrySample].published, 1); } + + #[test] + fn ai_locked_handle_starts_false() { + // Arrange + let s = TelemetryStream::new(8); + + // Act + let flag = s.ai_locked_handle(); + + // Assert + assert!(!flag.load(Ordering::Acquire)); + assert!(!s.handle().video_snapshot().ai_locked); + } + + #[test] + fn push_frame_bytes_inline_counts_in_video_snapshot() { + // Arrange + let cfg = TelemetryStreamConfig { + video_path: VideoPath::BytesInline, + ..TelemetryStreamConfig::default() + }; + let s = TelemetryStream::with_config(cfg); + let h = s.handle(); + let f = Frame { + seq: 1, + capture_ts_monotonic_ns: 1, + decode_ts_monotonic_ns: 2, + pixels: Arc::new(bytes::Bytes::from(vec![0u8; 32])), + width: 4, + height: 4, + pix_fmt: shared::models::frame::PixelFormat::Nv12, + ai_locked: false, + }; + + // Act + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + rt.block_on(async { + h.push_frame(f).await.unwrap(); + }); + + // Assert + assert_eq!(h.video_snapshot().published_frames, 1); + } + + #[test] + fn push_frame_rtsp_forward_does_not_count() { + // Arrange + let cfg = TelemetryStreamConfig { + video_path: VideoPath::RtspForward { + url: "rtsp://x".to_string(), + }, + ..TelemetryStreamConfig::default() + }; + let s = TelemetryStream::with_config(cfg); + let h = s.handle(); + let f = Frame { + seq: 1, + capture_ts_monotonic_ns: 1, + decode_ts_monotonic_ns: 2, + pixels: Arc::new(bytes::Bytes::from(vec![0u8; 32])), + width: 4, + height: 4, + pix_fmt: shared::models::frame::PixelFormat::Nv12, + ai_locked: false, + }; + + // Act + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + rt.block_on(async { + h.push_frame(f).await.unwrap(); + }); + + // Assert + assert_eq!(h.video_snapshot().published_frames, 0); + assert_eq!(h.video_snapshot().mode, "rtsp_forward"); + } } diff --git a/crates/telemetry_stream/tests/mapobjects_snapshot.rs b/crates/telemetry_stream/tests/mapobjects_snapshot.rs new file mode 100644 index 0000000..67d90b4 --- /dev/null +++ b/crates/telemetry_stream/tests/mapobjects_snapshot.rs @@ -0,0 +1,380 @@ +//! AZ-677 integration tests — snapshot on subscribe, diff stream while +//! connected, fresh snapshot on reconnect (no diff replay). + +use std::net::TcpListener; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use chrono::Utc; +use tokio::time::timeout; +use tokio_stream::StreamExt; +use tonic::transport::{Channel, Endpoint}; +use tonic::Request; +use uuid::Uuid; + +use shared::models::mapobject::{ + BundleFreshness, DiffKind, IgnoredItem, IgnoredItemSource, MapObject, MapObjectObservation, + MapObjectSource, MapObjectsBundle, RetentionScope, +}; +use shared::models::mission::Coordinate; + +use telemetry_stream::internal::mapobjects::MapObjectsDiff; +use telemetry_stream::{ + MapObjectsSnapshotSource, MapObjectsTopicMessage, SubscribeRequest, TelemetryStream, + TelemetryStreamClient, TelemetryTopic, +}; + +fn bind_ephemeral() -> (TcpListener, u16) { + let l = TcpListener::bind("127.0.0.1:0").expect("bind ephemeral"); + let port = l.local_addr().unwrap().port(); + (l, port) +} + +async fn connect(port: u16) -> TelemetryStreamClient { + let url = format!("http://127.0.0.1:{port}"); + let endpoint = Endpoint::from_shared(url) + .unwrap() + .connect_timeout(Duration::from_secs(2)); + for _ in 0..50 { + if let Ok(c) = TelemetryStreamClient::connect(endpoint.clone()).await { + return c; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + panic!("gRPC client failed to connect"); +} + +fn coord(lat: f64, lon: f64) -> Coordinate { + Coordinate { + latitude: lat, + longitude: lon, + altitude_m: 0.0, + } +} + +fn make_mapobject(class: &str) -> MapObject { + let now = Utc::now(); + MapObject { + h3_cell: 0, + mgrs_key: "33UWP00".to_string(), + class: class.to_string(), + class_group: "vehicle".to_string(), + gps_lat: 0.0, + gps_lon: 0.0, + size_width_m: 2.0, + size_length_m: 4.0, + confidence: 0.8, + first_seen: now, + last_seen: now, + mission_id: "m1".to_string(), + source: MapObjectSource::LocalObserved, + pending_upload: true, + } +} + +fn make_ignored() -> IgnoredItem { + IgnoredItem { + id: Uuid::new_v4(), + mgrs: "33UWP01".to_string(), + h3_cell: 0, + class_group: "vehicle".to_string(), + decline_time: Utc::now(), + operator_id: None, + mission_id: "m1".to_string(), + retention_scope: RetentionScope::Mission, + expires_at: None, + source: IgnoredItemSource::LocalAppended, + pending_upload: true, + } +} + +fn make_observation(class: &str) -> MapObjectObservation { + MapObjectObservation { + id: Uuid::new_v4(), + h3_cell: 0, + class: class.to_string(), + class_group: "vehicle".to_string(), + mission_id: "m1".to_string(), + uav_id: "uav_1".to_string(), + observed_at_monotonic_ns: 1, + observed_at_wallclock: Utc::now(), + gps_lat: 0.0, + gps_lon: 0.0, + mgrs: "33UWP02".to_string(), + size_width_m: 2.0, + size_length_m: 4.0, + confidence: 0.7, + diff_kind: DiffKind::New, + photo_ref: None, + raw_evidence: None, + } +} + +/// Snapshot source whose `snapshot()` content can be mutated between +/// subscribes. Tracks how many times it has been called so the +/// reconnect test can verify the source was re-queried (rather than a +/// cached snapshot). +struct MutableSource { + bundle: parking_lot::Mutex, + snapshots_emitted: AtomicUsize, +} + +impl MutableSource { + fn new(initial: MapObjectsBundle) -> Self { + Self { + bundle: parking_lot::Mutex::new(initial), + snapshots_emitted: AtomicUsize::new(0), + } + } + + fn set(&self, b: MapObjectsBundle) { + *self.bundle.lock() = b; + } +} + +impl MapObjectsSnapshotSource for MutableSource { + fn snapshot(&self) -> MapObjectsBundle { + self.snapshots_emitted.fetch_add(1, Ordering::Relaxed); + self.bundle.lock().clone() + } +} + +fn empty_bundle() -> MapObjectsBundle { + MapObjectsBundle { + schema_version: "1.0".to_string(), + mission_id: "m1".to_string(), + bbox: [coord(0.0, 0.0), coord(1.0, 1.0)], + map_objects: Vec::new(), + observations: Vec::new(), + ignored_items: Vec::new(), + as_of: Utc::now(), + freshness: Some(BundleFreshness::Fresh), + } +} + +/// AC-1 — A fresh subscriber to MapObjectsBundle receives exactly one +/// snapshot, populated from the configured source. +#[tokio::test] +async fn ac1_first_subscribe_receives_snapshot() { + // Arrange + let (listener, port) = bind_ephemeral(); + let server = TelemetryStream::new(64); + let handle = server.handle(); + let initial = MapObjectsBundle { + map_objects: (0..50).map(|_| make_mapobject("tank")).collect(), + ignored_items: (0..10).map(|_| make_ignored()).collect(), + ..empty_bundle() + }; + let src = Arc::new(MutableSource::new(initial)); + server.set_mapobjects_snapshot_source(src.clone()); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + let mut client = connect(port).await; + let mut stream = client + .subscribe(Request::new(SubscribeRequest { + client_id: "op_a".to_string(), + topics: vec![TelemetryTopic::MapObjectsBundle as i32], + })) + .await + .unwrap() + .into_inner(); + + // Act — pull the first message off the stream. + let msg = timeout(Duration::from_secs(2), stream.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + let payload: MapObjectsTopicMessage = serde_json::from_slice(&msg.payload_json).unwrap(); + + // Assert + let snap = match payload { + MapObjectsTopicMessage::Snapshot(s) => s, + MapObjectsTopicMessage::Diff(_) => panic!("expected Snapshot first; got Diff"), + }; + assert_eq!(snap.bundle.map_objects.len(), 50); + assert_eq!(snap.bundle.ignored_items.len(), 10); + assert_eq!(src.snapshots_emitted.load(Ordering::Relaxed), 1); + // Drop client + handle scope (cleanup). + drop(stream); + drop(client); + drop(handle); +} + +/// AC-2 — While connected, diffs appended by the composition root are +/// received by the client. +#[tokio::test] +async fn ac2_inflight_changes_emit_diffs() { + // Arrange + let (listener, port) = bind_ephemeral(); + let server = TelemetryStream::new(64); + let handle = server.handle(); + let src = Arc::new(MutableSource::new(empty_bundle())); + server.set_mapobjects_snapshot_source(src); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + let mut client = connect(port).await; + let mut stream = client + .subscribe(Request::new(SubscribeRequest { + client_id: "op_b".to_string(), + topics: vec![TelemetryTopic::MapObjectsBundle as i32], + })) + .await + .unwrap() + .into_inner(); + + // Drain the snapshot first. + let snap_msg = timeout(Duration::from_secs(2), stream.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + let snap_payload: MapObjectsTopicMessage = + serde_json::from_slice(&snap_msg.payload_json).unwrap(); + assert!(matches!(snap_payload, MapObjectsTopicMessage::Snapshot(_))); + + // Wait for the client to register before publishing diffs. + for _ in 0..50 { + if handle.snapshot().subscribed_clients == 1 { + break; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + + // Act — push one diff with 3 added observations + 1 ignored item. + let diff = MapObjectsDiff { + added: vec![ + make_observation("tank"), + make_observation("apc"), + make_observation("truck"), + ], + moved: vec![], + removed_candidates: vec![], + ignored: vec![make_ignored()], + }; + handle.push_mapobjects_diff(diff).unwrap(); + + // Pull the next message — must be a Diff carrying our content. + let diff_msg = timeout(Duration::from_secs(2), stream.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + let diff_payload: MapObjectsTopicMessage = + serde_json::from_slice(&diff_msg.payload_json).unwrap(); + let received_diff = match diff_payload { + MapObjectsTopicMessage::Diff(d) => d, + MapObjectsTopicMessage::Snapshot(_) => panic!("expected Diff; got Snapshot"), + }; + + // Assert + assert_eq!(received_diff.added.len(), 3); + assert_eq!(received_diff.ignored.len(), 1); + assert!(received_diff.moved.is_empty()); + assert!(received_diff.removed_candidates.is_empty()); +} + +/// AC-3 — Reconnect after disconnect emits a fresh snapshot reflecting +/// current state; diffs that flew during the gap are NOT replayed. +#[tokio::test] +async fn ac3_reconnect_resnaps_without_replay() { + // Arrange + let (listener, port) = bind_ephemeral(); + let server = TelemetryStream::new(64); + let handle = server.handle(); + let src = Arc::new(MutableSource::new(empty_bundle())); + server.set_mapobjects_snapshot_source(src.clone()); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + // Subscribe once, drain snapshot, drop. + let mut client = connect(port).await; + let mut stream = client + .subscribe(Request::new(SubscribeRequest { + client_id: "op_c".to_string(), + topics: vec![TelemetryTopic::MapObjectsBundle as i32], + })) + .await + .unwrap() + .into_inner(); + let first_snap = timeout(Duration::from_secs(2), stream.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + let first: MapObjectsTopicMessage = serde_json::from_slice(&first_snap.payload_json).unwrap(); + let first_snap = match first { + MapObjectsTopicMessage::Snapshot(s) => s, + MapObjectsTopicMessage::Diff(_) => panic!("first must be Snapshot"), + }; + assert!(first_snap.bundle.map_objects.is_empty()); + + // Disconnect. + drop(stream); + drop(client); + for _ in 0..50 { + if handle.snapshot().subscribed_clients == 0 { + break; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + + // Act — store grew by 5 while client was disconnected. Also push + // a couple of diffs that the reconnecting client must NOT see. + src.set(MapObjectsBundle { + map_objects: (0..5).map(|_| make_mapobject("tank")).collect(), + ..empty_bundle() + }); + handle + .push_mapobjects_diff(MapObjectsDiff { + added: vec![make_observation("ghost_during_gap")], + ..MapObjectsDiff::default() + }) + .unwrap(); + + // Reconnect with the same client_id. + let mut client2 = connect(port).await; + let mut stream2 = client2 + .subscribe(Request::new(SubscribeRequest { + client_id: "op_c".to_string(), + topics: vec![TelemetryTopic::MapObjectsBundle as i32], + })) + .await + .unwrap() + .into_inner(); + let resnap_msg = timeout(Duration::from_secs(2), stream2.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + let resnap_payload: MapObjectsTopicMessage = + serde_json::from_slice(&resnap_msg.payload_json).unwrap(); + + // Assert — first message after reconnect is a snapshot reflecting + // the new bundle. The skipped-during-gap diff is NOT in the + // stream (we read with a short timeout to prove no replay). + let resnap = match resnap_payload { + MapObjectsTopicMessage::Snapshot(s) => s, + MapObjectsTopicMessage::Diff(_) => { + panic!("first message after reconnect MUST be Snapshot, not Diff"); + } + }; + assert_eq!( + resnap.bundle.map_objects.len(), + 5, + "snapshot must reflect post-gap store" + ); + assert!( + src.snapshots_emitted.load(Ordering::Relaxed) >= 2, + "snapshot source must have been re-queried on reconnect" + ); + + // The reconnected stream should NOT immediately deliver another + // message — the gap diff was broadcast before reconnect and a + // late subscriber MUST not see it. + let maybe_extra = timeout(Duration::from_millis(300), stream2.next()).await; + assert!( + maybe_extra.is_err(), + "reconnect MUST NOT replay gap diff (got unexpected message)" + ); +} diff --git a/crates/telemetry_stream/tests/video_path.rs b/crates/telemetry_stream/tests/video_path.rs new file mode 100644 index 0000000..a9608d2 --- /dev/null +++ b/crates/telemetry_stream/tests/video_path.rs @@ -0,0 +1,307 @@ +//! AZ-676 integration tests — SubscribeVideo RPC, ai_locked atomic +//! coordination, and per-mode delivery semantics. + +use std::net::TcpListener; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::time::Duration; + +use bytes::Bytes; +use tokio::time::timeout; +use tokio_stream::StreamExt; +use tonic::transport::{Channel, Endpoint}; +use tonic::Request; + +use shared::contracts::TelemetrySink; +use shared::models::frame::{Frame, PixelFormat as SharedPixelFormat}; +use telemetry_stream::internal::video::VideoPath; +use telemetry_stream::{ + video_message, SubscribeVideoRequest, TelemetryStream, TelemetryStreamClient, + TelemetryStreamConfig, VideoMode, +}; + +fn bind_ephemeral() -> (TcpListener, u16) { + let l = TcpListener::bind("127.0.0.1:0").expect("bind ephemeral"); + let port = l.local_addr().unwrap().port(); + (l, port) +} + +async fn connect(port: u16) -> TelemetryStreamClient { + let url = format!("http://127.0.0.1:{port}"); + let endpoint = Endpoint::from_shared(url) + .unwrap() + .connect_timeout(Duration::from_secs(2)); + for _ in 0..50 { + if let Ok(c) = TelemetryStreamClient::connect(endpoint.clone()).await { + return c; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + panic!("gRPC client failed to connect"); +} + +fn make_frame(seq: u64, payload_len: usize) -> Frame { + Frame { + seq, + capture_ts_monotonic_ns: seq * 1_000_000, + decode_ts_monotonic_ns: seq * 1_000_000 + 10_000, + pixels: Arc::new(Bytes::from(vec![(seq & 0xff) as u8; payload_len])), + width: 1920, + height: 1080, + pix_fmt: SharedPixelFormat::Nv12, + ai_locked: false, + } +} + +/// AC-1 — rtsp_forward emits exactly the configured URL in the +/// session-start message; no frames flow. +#[tokio::test] +async fn ac1_rtsp_forward_emits_url_only() { + // Arrange + let (listener, port) = bind_ephemeral(); + let cfg = TelemetryStreamConfig { + video_path: VideoPath::RtspForward { + url: "rtsp://camera.local:8554/stream0".to_string(), + }, + ..TelemetryStreamConfig::default() + }; + let server = TelemetryStream::with_config(cfg); + let handle = server.handle(); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + let mut client = connect(port).await; + + // Act + let mut stream = client + .subscribe_video(Request::new(SubscribeVideoRequest { + client_id: "op_1".to_string(), + })) + .await + .unwrap() + .into_inner(); + + let first = timeout(Duration::from_secs(2), stream.next()) + .await + .expect("session-start within 2s") + .expect("stream open") + .expect("ok status"); + + // Push a frame anyway — in rtsp_forward mode it must NOT flow. + handle.push_frame(make_frame(1, 1024)).await.unwrap(); + + let second = timeout(Duration::from_millis(500), stream.next()).await; + + // Assert + let kind = first.kind.unwrap(); + match kind { + video_message::Kind::Start(start) => { + assert_eq!(start.mode, VideoMode::RtspForward as i32); + assert_eq!(start.rtsp_url, "rtsp://camera.local:8554/stream0"); + } + other => panic!("expected Start, got {other:?}"), + } + assert!( + second.is_err(), + "no further messages expected in rtsp_forward mode" + ); + assert_eq!(handle.video_snapshot().mode, "rtsp_forward"); +} + +/// AC-2 — bytes_inline forwards encoded frames to subscribed clients. +#[tokio::test] +async fn ac2_bytes_inline_forwards_frames() { + // Arrange + let (listener, port) = bind_ephemeral(); + let cfg = TelemetryStreamConfig { + video_path: VideoPath::BytesInline, + // Generous capacity so the test client keeps up without lag. + video_capacity: 256, + ..TelemetryStreamConfig::default() + }; + let server = TelemetryStream::with_config(cfg); + let handle = server.handle(); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + let mut client = connect(port).await; + let mut stream = client + .subscribe_video(Request::new(SubscribeVideoRequest { + client_id: "op_inline".to_string(), + })) + .await + .unwrap() + .into_inner(); + + // Drain the session-start first. + let start = timeout(Duration::from_secs(2), stream.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + assert!(matches!(start.kind.unwrap(), video_message::Kind::Start(_))); + + // Wait until the server has registered the session before + // publishing so no frames are emitted before the broadcast has a + // receiver. + for _ in 0..100 { + if handle.video_snapshot().video_session_count == 1 { + break; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + assert_eq!(handle.video_snapshot().video_session_count, 1); + + // Act — publish 100 frames; verify the client gets each one in + // monotonically increasing sequence. + let total: u64 = 100; + for seq in 0..total { + // Tiny pixel payload so the test isn't expensive. + handle.push_frame(make_frame(seq, 64)).await.unwrap(); + } + + let mut received = 0u64; + let mut last_seq: Option = None; + while received < total { + let msg = timeout(Duration::from_secs(2), stream.next()) + .await + .expect("ac2 stalled — frame not received in 2s") + .expect("stream open") + .expect("ok status"); + match msg.kind.unwrap() { + video_message::Kind::Frame(f) => { + if let Some(prev) = last_seq { + assert!(f.seq > prev, "monotonic seq violated: {prev} → {}", f.seq); + } + last_seq = Some(f.seq); + received += 1; + } + video_message::Kind::Start(_) => panic!("unexpected second Start"), + } + } + + // Assert + assert_eq!(received, total); + let snap = handle.video_snapshot(); + assert_eq!(snap.published_frames, total); + assert_eq!(snap.bytes_inline_drops_total, 0); +} + +/// AC-3 — ai_locked flips true on first subscriber, false when the +/// last subscriber disconnects. +#[tokio::test] +async fn ac3_ai_locked_toggles_on_session_start_and_stop() { + // Arrange + let (listener, port) = bind_ephemeral(); + let cfg = TelemetryStreamConfig { + video_path: VideoPath::BytesInline, + ..TelemetryStreamConfig::default() + }; + let server = TelemetryStream::with_config(cfg); + let handle = server.handle(); + let ai_locked = server.ai_locked_handle(); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + // No clients yet → false. + assert!(!ai_locked.load(Ordering::Acquire)); + + // Act 1 — first subscriber connects; flag must flip to true. + let mut c1 = connect(port).await; + let mut s1 = c1 + .subscribe_video(Request::new(SubscribeVideoRequest { + client_id: "op_a".to_string(), + })) + .await + .unwrap() + .into_inner(); + let _start = timeout(Duration::from_secs(2), s1.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + for _ in 0..100 { + if ai_locked.load(Ordering::Acquire) { + break; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + assert!( + ai_locked.load(Ordering::Acquire), + "ai_locked MUST be true once first session is active" + ); + + // Act 2 — second subscriber connects; flag stays true. + let mut c2 = connect(port).await; + let mut s2 = c2 + .subscribe_video(Request::new(SubscribeVideoRequest { + client_id: "op_b".to_string(), + })) + .await + .unwrap() + .into_inner(); + let _start = timeout(Duration::from_secs(2), s2.next()) + .await + .unwrap() + .unwrap() + .unwrap(); + for _ in 0..100 { + if handle.video_snapshot().video_session_count == 2 { + break; + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + assert!(ai_locked.load(Ordering::Acquire)); + assert_eq!(handle.video_snapshot().video_session_count, 2); + + // Act 3 — drop second client; one session left, still locked. + drop(s2); + drop(c2); + for _ in 0..100 { + if handle.video_snapshot().video_session_count == 1 { + break; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + assert_eq!(handle.video_snapshot().video_session_count, 1); + assert!(ai_locked.load(Ordering::Acquire)); + + // Act 4 — drop last client; ai_locked flips to false. + drop(s1); + drop(c1); + for _ in 0..100 { + if !ai_locked.load(Ordering::Acquire) { + break; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + assert!( + !ai_locked.load(Ordering::Acquire), + "ai_locked MUST be false after last session leaves" + ); + assert_eq!(handle.video_snapshot().video_session_count, 0); +} + +/// Empty client_id is rejected at the boundary (parity with Subscribe). +#[tokio::test] +async fn empty_client_id_rejected() { + // Arrange + let (listener, port) = bind_ephemeral(); + let cfg = TelemetryStreamConfig { + video_path: VideoPath::BytesInline, + ..TelemetryStreamConfig::default() + }; + let server = TelemetryStream::with_config(cfg); + let _h = server.handle(); + let (_join, _guard) = server.spawn_grpc_server_on(listener).unwrap(); + + let mut client = connect(port).await; + + // Act + let err = client + .subscribe_video(Request::new(SubscribeVideoRequest { + client_id: String::new(), + })) + .await + .expect_err("empty client_id must error"); + + // Assert + assert_eq!(err.code(), tonic::Code::InvalidArgument); +}