From 64542d32fccf6d22a59099d74e971e8903de0b2c Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Sun, 10 May 2026 00:21:34 +0300 Subject: [PATCH] Update autodev state, architecture documentation, and glossary terms Transitioned the autodev state to phase 21, reflecting the completion of Step 5 and the drafting of Step 6 epics. Revised the architecture documentation to clarify the roles of the Tile Manager and its components, ensuring accurate representation of the system's operational flow. Updated glossary entries for Flight State and Operator to incorporate recent changes and enhance clarity on component interactions and responsibilities. --- .cursor/rules/large-file-writes.mdc | 41 + _docs/02_document/architecture.md | 82 +- .../01_helper_imu_preintegrator.md | 32 + .../common-helpers/02_helper_se3_utils.md | 30 + .../03_helper_lightglue_runtime.md | 30 + .../common-helpers/04_helper_wgs_converter.md | 43 + .../05_helper_sha256_sidecar.md | 36 + .../06_helper_engine_filename_schema.md | 50 + .../common-helpers/07_helper_ransac_filter.md | 49 + .../08_helper_descriptor_normaliser.md | 33 + .../components/01_c1_vio/description.md | 136 ++ .../02_document/components/01_c1_vio/tests.md | 176 ++ .../components/02_c2_vpr/description.md | 138 ++ .../02_document/components/02_c2_vpr/tests.md | 144 ++ .../components/03_c2_5_rerank/description.md | 111 + .../components/03_c2_5_rerank/tests.md | 107 + .../components/04_c3_matcher/description.md | 127 + .../components/04_c3_matcher/tests.md | 144 ++ .../components/05_c3_5_adhop/description.md | 107 + .../components/05_c3_5_adhop/tests.md | 104 + .../components/06_c4_pose/description.md | 115 + .../components/06_c4_pose/tests.md | 123 + .../components/07_c5_state/description.md | 134 ++ .../components/07_c5_state/tests.md | 189 ++ .../08_c6_tile_cache/description.md | 172 ++ .../components/08_c6_tile_cache/tests.md | 168 ++ .../components/09_c7_inference/description.md | 155 ++ 
.../components/09_c7_inference/tests.md | 170 ++ .../10_c8_fc_adapter/description.md | 153 ++ .../components/10_c8_fc_adapter/tests.md | 224 ++ .../11_c10_provisioning/description.md | 151 ++ .../components/11_c10_provisioning/tests.md | 151 ++ .../12_c11_tilemanager/description.md | 201 ++ .../components/12_c11_tilemanager/tests.md | 213 ++ .../13_c12_operator_tooling/description.md | 145 ++ .../13_c12_operator_tooling/tests.md | 143 ++ .../components/14_c13_fdr/description.md | 136 ++ .../components/14_c13_fdr/tests.md | 166 ++ _docs/02_document/data_model.md | 571 +++++ .../02_document/deployment/ci_cd_pipeline.md | 194 ++ .../deployment/containerization.md | 245 ++ .../deployment/deployment_procedures.md | 265 +++ .../deployment/environment_strategy.md | 178 ++ _docs/02_document/deployment/observability.md | 232 ++ _docs/02_document/diagrams/components.drawio | 221 ++ _docs/02_document/diagrams/flows/00_index.md | 34 + _docs/02_document/epics.md | 2054 +++++++++++++++++ _docs/02_document/glossary.md | 6 +- _docs/02_document/risk_mitigations.md | 141 ++ _docs/02_document/system-flows.md | 99 +- .../02_document/tests/traceability-matrix.md | 2 +- _docs/_autodev_state.md | 6 +- 52 files changed, 8789 insertions(+), 88 deletions(-) create mode 100644 .cursor/rules/large-file-writes.mdc create mode 100644 _docs/02_document/common-helpers/01_helper_imu_preintegrator.md create mode 100644 _docs/02_document/common-helpers/02_helper_se3_utils.md create mode 100644 _docs/02_document/common-helpers/03_helper_lightglue_runtime.md create mode 100644 _docs/02_document/common-helpers/04_helper_wgs_converter.md create mode 100644 _docs/02_document/common-helpers/05_helper_sha256_sidecar.md create mode 100644 _docs/02_document/common-helpers/06_helper_engine_filename_schema.md create mode 100644 _docs/02_document/common-helpers/07_helper_ransac_filter.md create mode 100644 _docs/02_document/common-helpers/08_helper_descriptor_normaliser.md create mode 100644 
_docs/02_document/components/01_c1_vio/description.md create mode 100644 _docs/02_document/components/01_c1_vio/tests.md create mode 100644 _docs/02_document/components/02_c2_vpr/description.md create mode 100644 _docs/02_document/components/02_c2_vpr/tests.md create mode 100644 _docs/02_document/components/03_c2_5_rerank/description.md create mode 100644 _docs/02_document/components/03_c2_5_rerank/tests.md create mode 100644 _docs/02_document/components/04_c3_matcher/description.md create mode 100644 _docs/02_document/components/04_c3_matcher/tests.md create mode 100644 _docs/02_document/components/05_c3_5_adhop/description.md create mode 100644 _docs/02_document/components/05_c3_5_adhop/tests.md create mode 100644 _docs/02_document/components/06_c4_pose/description.md create mode 100644 _docs/02_document/components/06_c4_pose/tests.md create mode 100644 _docs/02_document/components/07_c5_state/description.md create mode 100644 _docs/02_document/components/07_c5_state/tests.md create mode 100644 _docs/02_document/components/08_c6_tile_cache/description.md create mode 100644 _docs/02_document/components/08_c6_tile_cache/tests.md create mode 100644 _docs/02_document/components/09_c7_inference/description.md create mode 100644 _docs/02_document/components/09_c7_inference/tests.md create mode 100644 _docs/02_document/components/10_c8_fc_adapter/description.md create mode 100644 _docs/02_document/components/10_c8_fc_adapter/tests.md create mode 100644 _docs/02_document/components/11_c10_provisioning/description.md create mode 100644 _docs/02_document/components/11_c10_provisioning/tests.md create mode 100644 _docs/02_document/components/12_c11_tilemanager/description.md create mode 100644 _docs/02_document/components/12_c11_tilemanager/tests.md create mode 100644 _docs/02_document/components/13_c12_operator_tooling/description.md create mode 100644 _docs/02_document/components/13_c12_operator_tooling/tests.md create mode 100644 
_docs/02_document/components/14_c13_fdr/description.md create mode 100644 _docs/02_document/components/14_c13_fdr/tests.md create mode 100644 _docs/02_document/data_model.md create mode 100644 _docs/02_document/deployment/ci_cd_pipeline.md create mode 100644 _docs/02_document/deployment/containerization.md create mode 100644 _docs/02_document/deployment/deployment_procedures.md create mode 100644 _docs/02_document/deployment/environment_strategy.md create mode 100644 _docs/02_document/deployment/observability.md create mode 100644 _docs/02_document/diagrams/components.drawio create mode 100644 _docs/02_document/diagrams/flows/00_index.md create mode 100644 _docs/02_document/epics.md create mode 100644 _docs/02_document/risk_mitigations.md diff --git a/.cursor/rules/large-file-writes.mdc b/.cursor/rules/large-file-writes.mdc new file mode 100644 index 0000000..759ca7f --- /dev/null +++ b/.cursor/rules/large-file-writes.mdc @@ -0,0 +1,41 @@ +--- +description: "Use chunked writes (Write + StrReplace marker pattern) for large generated files, especially after a monolithic Write fails" +alwaysApply: true +--- +# Large File Writes — Chunk on Failure + +When a `Write` call to a single file fails (timeout, payload limit, "Invalid arguments", or any tool error) and the intended content is large (>~500 lines or >~50 KB), do NOT retry the same monolithic Write. Switch to chunked writes: + +1. **First Write** — create the file with header + table of contents (if applicable) + an explicit append marker, e.g. + + ``` + <!-- APPEND-MARKER --> + ``` + +2. **Each subsequent chunk** — use `StrReplace` to replace the marker with `<new chunk>\n<marker>` so the marker stays at the end. This is idempotent: if a chunk fails, retry it without losing earlier chunks. + +3. **Final chunk** — `StrReplace` removes the marker. + +## Why + +- Tool argument size limits and transient failures hit large monolithic writes hardest. Retrying the same large payload typically fails for the same reason. 
+- Chunked writes are recoverable per chunk. The earlier chunks are durable on disk. +- A unique marker is greppable, visible in diffs, and stops accidental insertion in the wrong place. + +## Triggers + +- Generated documentation that aggregates per-component content (epics, design docs, multi-section architecture summaries, traceability dumps). +- Large fixture or test-data files written from a template. +- Any single-file artifact you can pre-estimate at >~500 lines. + +## Do NOT chunk + +- Files under ~200 lines — a single `Write` is faster, clearer, and easier to review. +- Source code files where appending breaks module structure (functions, classes, imports). Split into multiple files instead. +- Files where ordering of sections is computed late and inserting in the middle is required — use a single `Write` once the full content is known. + +## Anti-patterns + +- Retrying the same failed monolithic `Write` more than once. Twice is the limit; on the second failure, switch strategies. +- Using `Shell` with heredoc (`cat <<EOF >> file`) or `echo >>` to append — these bypass the editor diff view and break the StrReplace contract for the next chunk. +- Embedding the marker so deep inside structured content that a chunk's `StrReplace` becomes ambiguous. Place the marker on its own line at the very end of the file. diff --git a/_docs/02_document/architecture.md b/_docs/02_document/architecture.md index a746938..3664771 100644 --- a/_docs/02_document/architecture.md +++ b/_docs/02_document/architecture.md @@ -7,7 +7,7 @@ > User-confirmed in Plan Phase 2a.0 (2026-05-09). This section is the spine of the document; nothing below it may contradict it without a recorded ADR. -The system is a **Jetson Orin Nano Super-hosted onboard companion** that delivers a GPS-equivalent WGS84 position (with honest 6×6 covariance and provenance label `{satellite_anchored | visual_propagated | dead_reckoned}`) to a fixed-wing UAV's flight controller in GPS-denied or GPS-spoofed environments. 
It runs as a **single Python-with-C++-extensions monolithic process per binary track** on the companion PC, fusing pre-flight-cached satellite tiles served by the parent-suite `satellite-provider` with live nav-camera frames (3 Hz) and FC-supplied IMU/attitude (100–200 Hz). A canonical hierarchical pipeline `VIO → retrieval → re-rank → matching → AdHoP-conditional refinement → pose → fusion` drives the per-frame loop within a 400 ms p95 latency budget. Cross-component coupling routes through a shared GTSAM substrate so posterior covariance is recovered natively (D-C5-5 = (c)). The companion is **read-only against `satellite-provider` while airborne**; mid-flight tiles are generated locally and uploaded post-landing only via a separate operator-side tool, with the airborne companion image not loading the upload code path at all (process-level isolation, AC-8.4). +The system is a **Jetson Orin Nano Super-hosted onboard companion** that delivers a GPS-equivalent WGS84 position (with honest 6×6 covariance and provenance label `{satellite_anchored | visual_propagated | dead_reckoned}`) to a fixed-wing UAV's flight controller in GPS-denied or GPS-spoofed environments. It runs as a **single Python-with-C++-extensions monolithic process per binary track** on the companion PC, fusing pre-flight-cached satellite tiles served by the parent-suite `satellite-provider` with live nav-camera frames (3 Hz) and FC-supplied IMU/attitude (100–200 Hz). A canonical hierarchical pipeline `VIO → retrieval → re-rank → matching → AdHoP-conditional refinement → pose → fusion` drives the per-frame loop within a 400 ms p95 latency budget. Cross-component coupling routes through a shared GTSAM substrate so posterior covariance is recovered natively (D-C5-5 = (c)). 
The companion is **read-only against `satellite-provider` while airborne** — both the pre-flight tile download and the post-landing tile upload run from the operator-side `Tile Manager` (C11), a separate binary that is excluded from the airborne CMake target so the companion image cannot load either code path even via reflection or config error (process-level isolation, AC-8.4). ### Components — intent-level (formal decomposition belongs to Step 3) @@ -21,11 +21,10 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver - **C6 — Tile cache + spatial index**: PostgreSQL btree spatial index over filesystem `./tiles/{zoomLevel}/{x}/{y}.jpg` mirroring `satellite-provider`'s on-disk layout, plus FAISS HNSW index for VPR descriptors (`.index` written via `faiss.write_index` + atomicwrites + SHA-256 content-hash gate, D-C10-3). - **C7 — On-Jetson inference runtime**: TensorRT 10.3 engines (Polygraphy / trtexec / IBuilderConfig hybrid orchestration), JetPack 6.2, SM 87; ONNX Runtime + TRT EP fallback; pure PyTorch FP16 baseline. - **C8 — Flight-Controller adapter**: `pymavlink` `GPS_INPUT` for ArduPilot Plane (MAVLink 2.0 message signing on the companion ↔ AP wired channel, D-C8-9 = (d)) and `YAMSPy` / INAV-Toolkit `MSP2_SENSOR_GPS` for iNav (signing-gap accepted residual risk). -- **C10 — Pre-flight cache provisioning**: orchestrates download-from-`satellite-provider` → tile cache; descriptor generation; engine compilation; calibration loading; manifest+content-hash verification (D-C10-1, D-C10-3, D-C10-6, D-C10-7). -- **C11 — Post-landing tile upload tool** (operator-side, distinct binary/image): only loaded when `flight_state == ON_GROUND`; pushes locally-saved mid-flight tiles to `satellite-provider`'s ingest endpoint (D-PROJ-2 contract; not yet implemented service-side). 
+- **C10 — Pre-flight cache provisioning**: builds the **model-derived** cache artifacts (descriptor generation, engine compilation, manifest + content-hash) on top of an already-populated tile store; F2 takeoff verifier (D-C10-1, D-C10-3, D-C10-6, D-C10-7). C10 does NOT touch `satellite-provider` — tile network I/O lives in C11. +- **C11 — Tile Manager** (operator-side, distinct binary/image, ADR-004 process-isolated): owns operator-side network I/O against `satellite-provider` in **both directions**. `TileDownloader` interface fetches tiles into C6 during F1 (TLS + service-internal API key); `TileUploader` interface, gated on `flight_state == ON_GROUND`, pushes mid-flight tiles to `satellite-provider`'s ingest endpoint (D-PROJ-2 contract; not yet implemented service-side). The component bundles both interfaces because they share auth, HTTP client, deployment unit, and the airborne-exclusion property. - **C12 — Operator pre-flight tooling** (Plan-phase carryforward, deferred from research): cache provisioning UI, sector classification (active-conflict vs stable rear), freshness pipeline workflow. - **C13 — Flight Data Recorder (FDR)**: per-flight ≤64 GB NVM record of estimates + IMU traces + emitted MAVLink + system health + mid-flight tiles + ≤0.1 Hz failed-tile thumbnails; raw nav/AI-cam frames excluded (AC-8.5, AC-NEW-3). -- **`mock-suite-sat-service`** (testing-time component boundary, F7): a process that publishes the `satellite-provider` ingest contract during NFT-SEC-01 / FT-P-17 / IT runs; not a fixture but a real component boundary. - **External: `satellite-provider`** (parent-suite .NET 8 service): tile producer pre-flight; tile sink post-landing (D-PROJ-2). Treated as a planned external dependency on the upload + voting paths. ### Architectural principles / non-negotiables @@ -33,7 +32,7 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver 1. 
**Camera-specific math enters only via a `Camera calibration artifact` JSON** (intrinsics + distortion + body-to-camera extrinsics + acquisition method `factory_sheet | checkerboard_refined | hybrid`). No hard-coded camera math anywhere; test fixtures (`adti26`) and production deployments (`adti20`) load different artifacts on the same code path. 2. **VioStrategy is selected at startup via config; not hot-swappable mid-flight.** 3. **Build-time exclusion of unused `Strategy` implementations.** A given binary links only the implementations it actually uses at runtime. The default deployment binary links the production-default strategies (e.g. OKVIS2 on C1) plus the engine-rule-mandatory simple-baseline (KltRansac on C1); the IT-12 comparative-study binary links all C1 implementations side-by-side. The mechanism is per-component CMake `BUILD_*` flags (`BUILD_VINS_MONO`, `BUILD_SALAD`, …) plus the per-binary composition root choosing among the linked implementations at startup. **Justification is technical** — binary size on the 8 GB shared Jetson, boot/load time inside the AC-NEW-1 30 s budget, deployed dependency / attack surface, and accidental-selection risk reduction (a binary with only OKVIS2 + KltRansac linked cannot be misconfigured into running VINS-Mono). **Component licenses do not drive this decision** — see ADR-002. CI emits both the deployment binary and the research binary on every PR. -4. **In-air outbound writes to `satellite-provider` are forbidden.** Enforced primarily by **process-level isolation** — the upload daemon (C11) is not loaded in the airborne companion image. Software guard on `flight_state == ON_GROUND` is a defense-in-depth check, not the primary control. +4. 
**In-air network I/O against `satellite-provider` is forbidden — in BOTH directions.** Enforced primarily by **process-level isolation** — the Tile Manager (C11), which carries both the `TileDownloader` and the `TileUploader` interfaces, is not loaded in the airborne companion image. Software guard on `flight_state == ON_GROUND` (upload) is a defense-in-depth check, not the primary control. In flight the companion touches only the local C6 cache (pre-flight tiles are read; mid-flight tiles are written locally and never uploaded); both pre-flight tile fetching and post-landing tile upload happen on the operator workstation. 5. **All persistent imagery is in `satellite-provider`'s on-disk tile format** (`./tiles/{zoomLevel}/{x}/{y}.jpg` + matching metadata) so post-landing upload is byte-identical. No raw frames on disk except the AC-8.5 forensic ≤0.1 Hz failed-tile thumbnail log inside FDR. 6. **Honest 6×6 posterior covariance via GTSAM `Marginals`** is the safety floor for AC-NEW-4 and AC-NEW-7. Under-reported `horiz_accuracy` is a defect, not a tuning knob. 7. **MAVLink 2.0 message signing on the companion ↔ ArduPilot wired channel**, with per-flight key rotation (D-C8-9 = (d)). iNav has no signing equivalent — accepted residual risk, Plan-phase carryforward proposes an iNav firmware feature request. @@ -47,7 +46,7 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver ### Open architectural items (tracked, NOT blocking Phase 2a) - **D-PROJ-1** (camera calibration acquisition): CLOSED in this Plan cycle as hybrid factory + checkerboard refinement (~1 day per deployed unit). No physical hardware available this cycle, so production calibration is documented as instructions only; runtime path uses test-fixture calibration for `adti26` images. -- **D-PROJ-2** (parent-suite `satellite-provider` ingest endpoint + multi-flight voting layer): open, parent-suite work, tracked in `_docs/_process_leftovers/2026-05-09_satellite-provider-design-tasks.md`. Onboard-side proceeds with `mock-suite-sat-service` standing in for the contract. 
+- **D-PROJ-2** (parent-suite `satellite-provider` ingest endpoint + multi-flight voting layer): open, parent-suite work, tracked in `_docs/_process_leftovers/2026-05-09_satellite-provider-design-tasks.md`. Onboard-side proceeds against the real `satellite-provider` — and uses an e2e-test-only `mock-suite-sat-service` fixture (under `tests/fixtures/`) to stand in for the not-yet-shipped POST contract during integration tests. - **D-PROJ-3** (multi-flight fixture acquisition for AC-NEW-4 / AC-NEW-7 statistical headroom): not pursued this cycle; AC-text was relaxed 2026-05-09 to Monte-Carlo-over-current-data with stated 95 % CI; multi-flight statistical headroom is residual risk in the Step 4 risk register. - **D-C8-2 runtime gate** (companion-driven `MAV_CMD_SET_EKF_SOURCE_SET` switch): pattern is firmware-supported but not deployed-precedent. ArduPilot Plane SITL validation (IT-3) is the lock gate; D-C8-2-FALLBACK options recorded. - **D-C2-12** (DINOv2-feature-based matcher evaluation): carryforward research item; potentially closes D-C3-1 retrain cost. 
@@ -64,12 +63,11 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver |---|---| | Companion PC runtime (Jetson Orin Nano Super, JetPack 6.2) | Flight controller firmware (ArduPilot Plane, iNav) | | All onboard pose-estimation logic (C1–C8, C13) | Parent-suite `satellite-provider` (.NET 8 REST microservice) | -| Pre-flight cache provisioning (C10) | GCS (QGroundControl) | -| Operator-side post-landing upload tool (C11) | Nav camera hardware (`adti20`); AI-camera hardware | +| Pre-flight cache artifact build (C10 — engines + descriptors + manifest) | GCS (QGroundControl) | +| Operator-side Tile Manager (C11 — pre-flight download + post-landing upload) | Nav camera hardware (`adti20`); AI-camera hardware | | Operator pre-flight tooling (C12) | UAV airframe / FC IMU / sensors | | FDR writer (C13) | Operator's workstation OS / authentication | | Camera calibration artifact format + loader | The act of calibration itself (operator runs checkerboard rig) | -| `mock-suite-sat-service` (testing-time, F7) | Cellular / satellite GCS link bandwidth | **External systems**: @@ -103,7 +101,7 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver | VIO (production) | OKVIS2 (BSD-3-Clause) | upstream HEAD pinned per Plan-phase | D-C1-1-SUB-A = (a) production-default | | VIO (research / IT-12) | VINS-Mono | upstream HEAD pinned per Plan-phase | Research binary only (`BUILD_VINS_MONO=ON`) for IT-12 comparative study; build-time exclusion from deployment binary per ADR-002 | | VIO (mandatory baseline) | KLT+RANSAC over OpenCV | OpenCV ≥ 4.12.0 | Engine-rule-required mandatory simple-baseline | -| Tile cache backend | PostgreSQL + filesystem | PostgreSQL 16 (mirror of `satellite-provider`) | C6 mirrors `satellite-provider`'s on-disk and table layout for byte-identical post-landing upload | +| Tile cache backend | PostgreSQL + filesystem | PostgreSQL 16 (mirror of `satellite-provider`) | C6 mirrors `satellite-provider`'s 
on-disk and table layout so C11 `TileUploader`'s post-landing payload is byte-identical to what the parent suite already serves | | Container runtime | Docker (Tier-1) + bare JetPack (Tier-2) | Docker 27.x; JetPack 6.2 | Tier-1 workstation Docker; Tier-2 Jetson native (no Docker — direct JetPack to keep INT8 calibration cache trustworthy per D-C10-6) | | Build system | CMake + Python `pyproject.toml` | CMake ≥ 3.27 | CMake `option(BUILD_VINS_MONO ...)` D-C1-1-SUB-A; Python wheels built per Jetson via cibuildwheel-equivalent recipe | | CI/CD | GitHub Actions (Tier-1) + self-hosted Jetson runner (Tier-2) | latest pinned action versions | Two-binary emit on every PR (production + research); Tier-2 runs are AC-bound jobs only | @@ -132,11 +130,11 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver | `staging-tier1` | CI runs that don't require Jetson hardware | GitHub-hosted runner (x86_64); Docker | | `staging-tier2` | CI runs that require Jetson (AC-bound jobs only) | Self-hosted Jetson runner; bare JetPack (no Docker) | | `production` | Deployed companion image on a UAV | Jetson Orin Nano Super (pinned); bare JetPack; no inbound network listening (defense-in-depth, NFT-SEC-05) | -| `production-operator-workstation` | Pre-flight cache build + post-landing upload + FDR retrieval | Operator's Linux workstation; Docker for `satellite-provider` mirror | +| `production-operator-workstation` | Pre-flight tile download + cache artifact build (C10) + post-landing tile upload (C11) + FDR retrieval | Operator's Linux workstation; Docker for `satellite-provider` mirror | **Infrastructure**: -- **No cloud orchestration**. The companion is an embedded edge device; the operator's workstation is a single host that runs the operator tooling (C11, C12) and a local `satellite-provider` mirror or VPN-reaches the lab `satellite-provider`. +- **No cloud orchestration**. 
The companion is an embedded edge device; the operator's workstation is a single host that runs the operator tooling (C11 Tile Manager + C12 Operator Pre-flight Tooling) and a local `satellite-provider` mirror or VPN-reaches the lab `satellite-provider`. - **Two binaries shipped on every PR** (ADR-002): `deployment-binary` (links the production-default strategy on each component + the mandatory simple-baseline; CMake `BUILD_VINS_MONO=OFF`, `BUILD_SALAD=OFF`, …) and `research-binary` (links every available strategy on every component; all `BUILD_*` flags `ON`, used for the IT-12 comparative study). The deployment binary is what installs onto an operational Jetson; the research binary runs on dev/lab Jetson hardware for the comparative-study report. The same code base produces both — ADR-002 mechanism scales to additional binary variants later if packaging strategy requires it. - **Container scope**: Tier-1 uses Docker (`docker compose` for the developer setup including a `mock-suite-sat-service` container, the operator-tool container, and a Postgres for C6). **Tier-2 (Jetson) does NOT use Docker** — TensorRT INT8 calibration caches and `jetson-stats` thermal telemetry are most reliable without a container layer, per D-C7-9 + D-C10-6. The deployed image on the Jetson is a JetPack-based system image with the deployment binary preinstalled. - **Scaling**: not applicable (per-UAV, single companion). Failover is per-airframe (the FC's IMU-only fallback at AC-5.2 is the system's "scale-out"). 
@@ -145,7 +143,7 @@ The system is a **Jetson Orin Nano Super-hosted onboard companion** that deliver | Config | dev-tier1 | staging-tier2 | production | |---|---|---|---| -| `satellite-provider` host | local Docker (`satellite-provider:5100`) | mock-suite-sat-service or test fixture | operator workstation (pre-flight only) | +| `satellite-provider` host | local Docker (`satellite-provider:5100`) | real `satellite-provider` Docker (download path; existing) + e2e-test `mock-suite-sat-service` fixture (POST/upload only, until D-PROJ-2 lands) | operator workstation (pre-flight only) | | Camera calibration source | test-fixture artifact (`adti26.json`) | test-fixture artifact | `adti20.json` (D-PROJ-1 hybrid output) | | Logging sink | console (DEBUG) | journald + FDR | FDR (per-flight, ≤ 64 GB rolling) | | MAVLink signing key | dev key (committed to test fixtures) | per-flight key from test config | per-flight key generated at takeoff load, rotated per flight | @@ -167,7 +165,7 @@ source repo │ ├─ deployment-binary tarball (production-default strategies + mandatory baselines, ADR-002) │ ├─ research-binary tarball (all strategies linked; for IT-12 comparative study) │ ├─ JetPack image (deployment-binary preinstalled) - │ └─ operator-tooling tarball (C11 + C12 + mock-suite-sat-service compose) + │ └─ operator-tooling tarball (C11 + C12 + e2e-test mock-suite-sat-service compose for offline integration testing) │ └─→ deploy paths: ├─ Jetson operational deploy: JetPack image flash (deployment-binary) @@ -215,13 +213,13 @@ source repo **Data flow summary** (one-line each; full sequences in `system-flows.md`): -- Pre-flight: `satellite-provider` → C10 → `Tile` cache + `EngineCacheEntry` + `Manifest` + descriptor `.index` (atomic write + content-hash gate). +- Pre-flight: `satellite-provider` → C11 `TileDownloader` → `Tile` cache (C6) → C10 → `EngineCacheEntry` + `Manifest` + descriptor `.index` (atomic write + content-hash gate). 
- Takeoff load: `Manifest` content-hash verify + FAISS mmap + TRT deserialize + MAVLink signing handshake → ready. - Per-frame runtime: `NavCameraFrame` + `ImuWindow` → C1 (`VioOutput`) → C2 → C2.5 → C3 → C3.5 → C4 → C5 → C8 → `EmittedExternalPosition` to FC. - Mid-flight tile gen: `NavCameraFrame` + `PoseEstimate` → orthorectify → dedup → write to local C6 (no upload). - GCS telemetry: C5 → C8 → 1–2 Hz downsampled summary to QGroundControl. - FDR: every emitted/received stream → C13 ring with per-flight ≤ 64 GB cap. -- Post-landing: operator triggers C11 → reads C6 → uploads to `satellite-provider` ingest endpoint (D-PROJ-2 contract). +- Post-landing: operator triggers C11 `TileUploader` → reads C6 → uploads to `satellite-provider` ingest endpoint (D-PROJ-2 contract). --- @@ -252,7 +250,7 @@ source repo | ArduPilot Plane FC | MAVLink 2.0 (`GPS_INPUT` 5 Hz; `MAV_CMD_SET_EKF_SOURCE_SET`; `STATUSTEXT` / `NAMED_VALUE_FLOAT`) over UART/USB | MAVLink 2.0 message signing, per-flight key (D-C8-9 = (d)) | 5 Hz periodic emit; signing handshake at takeoff load (≤ 5 s, AC-NEW-1) | Signing handshake fail → companion refuses takeoff; mid-flight signing key compromise → FC ignores unsigned messages, AC-5.2 takes over | | iNav FC | MSP2 `MSP2_SENSOR_GPS` over UART; MAVLink outbound for telemetry | None (iNav has no signing) — accepted residual risk per Mode B Source #129 | 5 Hz periodic emit | Mid-flight bad-frame → iNav `mspGPSReceiveNewData()` receives only the latest frame; honest `hPosAccuracy` is the only safety net | | QGroundControl (GCS) | MAVLink 2.0 (`STATUSTEXT`, `NAMED_VALUE_FLOAT`, `GPS_RAW_INT`) | Same MAVLink 2.0 signing as the AP path (AP profile); no signing on iNav profile | 1–2 Hz downsampled (AC-6.1); operator commands are best-effort | GCS link drop → companion continues; no mid-flight reconfiguration is required from GCS | -| `satellite-provider` (pre-flight) | REST over HTTP, OpenAPI at `/swagger`; filesystem access if co-located | TLS + service-internal 
API key (operator workstation only); the companion never reaches `satellite-provider` directly while airborne | Off-line pre-flight; not time-critical | Cache miss → C10 fails fast pre-flight; takeoff blocked | +| `satellite-provider` (pre-flight) | REST over HTTP, OpenAPI at `/swagger`; filesystem access if co-located | TLS + service-internal API key (operator workstation only); the companion never reaches `satellite-provider` directly while airborne | Off-line pre-flight; not time-critical | Cache miss → C11 `TileDownloader` fails fast pre-flight; C10 build is blocked downstream; takeoff blocked | | `satellite-provider` (post-landing ingest, D-PROJ-2, **planned**) | REST `POST /api/satellite/tiles/ingest` (multipart) | Per-flight onboard signing key (carried with each tile); rate-limited | Bursty post-landing | Endpoint not yet implemented service-side → C11 keeps batches queued locally; never blocks the pre-flight cycle | | Operator workstation (pre-flight stage) | Filesystem (USB / Ethernet) | OS-level (operator login) | Not time-critical | Bad-stage detection via Manifest content-hash gate (D-C10-3) | | Nav camera | USB / MIPI-CSI / GigE (lens-module dependent) | n/a | 3 Hz | Frame drop / hardware fault → "VISUAL_BLACKOUT" path (AC-3.5, AC-NEW-8) | @@ -264,7 +262,7 @@ The onboard side of D-PROJ-2 is fully specified in `_docs/_process_leftovers/202 - **`Tile` writes are append-only and idempotent** (the same `(zoomLevel, lat, lon, capture_timestamp, companion_id, flight_id)` tuple is the dedup key). - **Quality metadata is mandatory on every uploaded tile** so the planned voting layer can promote `pending → trusted` without re-deriving statistics on the service side. - **Onboard tiles never claim the `trusted` status**; they are uploaded as `pending` and the parent-suite voting layer (D-PROJ-2 design task #2) decides promotion. 
-- **Mock substitute**: `mock-suite-sat-service` (testing-time component, F7) implements the contract for NFT-SEC-01 / FT-P-17 / IT runs — it is treated as a real component boundary, not a test fixture. +- **Test substitute**: `mock-suite-sat-service` is an e2e-test-only fixture (under `tests/fixtures/mock-suite-sat-service/`) that implements the upload contract for NFT-SEC-01 / FT-P-17 / IT runs until D-PROJ-2 lands service-side. It is **not a component** in the architectural sense — the production architectural counterparty for both download and upload is the real `satellite-provider`. The fixture is retired the moment the real ingest endpoint ships. --- @@ -319,7 +317,7 @@ The onboard side of D-PROJ-2 is fully specified in `_docs/_process_leftovers/202 **Authorization**: -- **Onboard runtime**: a single principal (the runtime process); no in-process privilege boundaries. The post-landing upload tool (C11) runs as a different principal with write access to `satellite-provider` and only loads when `flight_state == ON_GROUND`. The airborne image does not contain the C11 binary at all. +- **Onboard runtime**: a single principal (the runtime process); no in-process privilege boundaries. The Tile Manager (C11) runs as a different principal on the operator workstation, holding the only credentials that reach `satellite-provider` (TLS API key for download; per-flight onboard signing key for post-landing upload). The airborne image does not contain the C11 binary at all. - **GCS**: operator commands (`AC-6.2`) are best-effort hints; the operator cannot promote a pose, override covariance, or reach the `satellite-provider` write path. Operator re-loc requests trigger the satellite re-localization flow (F6) but do not bypass any safety gate. **Data protection**: @@ -405,9 +403,14 @@ This decision is made on **technical grounds only**. 
Component licenses (BSD/Apa ### ADR-004 — Process-level isolation for in-air upload prevention (AC-8.4 enforcement) -**Context**: AC-8.4 forbids in-air outbound writes to `satellite-provider` for drone-security reasons. A software guard checking `flight_state == ON_GROUND` can be bypassed by code injection if the upload code path is ever loaded. +**Context**: AC-8.4 forbids in-air outbound writes to `satellite-provider` for drone-security reasons. The companion is also read-only against `satellite-provider` while airborne — there is no operational reason to fetch tiles in flight either, since the pre-flight cache is the contract. A software guard checking `flight_state == ON_GROUND` can be bypassed by code injection if the network I/O code path is ever loaded. -**Decision**: The post-landing upload tool (C11) is a **separate binary / image** that runs only on the operator's workstation post-landing; the airborne companion image does not contain the C11 binary at all. The `flight_state == ON_GROUND` software guard remains as defense-in-depth. The local mid-flight tile format is byte-identical to `satellite-provider`'s on-disk layout so no transformation is needed at upload time. +**Decision**: The Tile Manager (C11) is a **separate binary / image** that runs only on the operator's workstation; the airborne companion image does not contain the C11 binary at all — neither the `TileDownloader` (pre-flight) nor the `TileUploader` (post-landing) code paths can be reached from the airborne process. The `flight_state == ON_GROUND` software guard inside the `TileUploader` remains as defense-in-depth for the upload direction. The local mid-flight tile format is byte-identical to `satellite-provider`'s on-disk layout so no transformation is needed at upload time. + +**Enforcement gates (per R02 risk register)**: +1. 
**CI SBOM diff**: the build pipeline fails the airborne `production-binary` artifact if any symbol from `c11_tilemanager/` (or any module that transitively imports `c11_tilemanager`) appears in the linked image. This is an extension of the per-implementation SBOM enforcement already in ADR-002. +2. **Runtime self-check in `runtime_root.py`**: at startup, before opening the FC adapter, the airborne composition root attempts `importlib.util.find_spec("c11_tilemanager")` and panics if the spec resolves to anything other than `None`. Cost: one import lookup at startup; benefit: catches a build-system regression even if SBOM diff was bypassed. +3. **Network egress test (NFT-SEC-02)**: the airborne process is run inside a network namespace with no route to `satellite-provider`'s host; any attempted outbound TCP connection to it is a release-blocking test failure. **Alternatives considered**: 1. Single binary with software-only guard — rejected on principle: a runtime guard cannot be the primary control for an "is the system airborne?" safety property. @@ -439,17 +442,19 @@ This decision is made on **technical grounds only**. Component licenses (BSD/Apa **Consequences**: ~5–10 % accuracy loss at the upper thermal envelope (still inside AC-NEW-4). The hybrid is part of the runtime, not a config knob; the threshold is. -### ADR-007 — `mock-suite-sat-service` is a real component boundary, not a fixture (F7) +### ADR-007 — `mock-suite-sat-service` is an e2e-test fixture, not a first-class component (REVERSED 2026-05-09) -**Context**: D-PROJ-2 (parent-suite ingest endpoint + voting layer) is not yet implemented. NFT-SEC-01 / FT-P-17 / IT runs need a counterparty for the post-landing upload contract. Treating the counterparty as a "fixture" buried inside C8 hides the actual contract. +**Context**: D-PROJ-2 (parent-suite ingest endpoint + voting layer) is not yet implemented. NFT-SEC-01 / FT-P-17 / IT runs need a counterparty for the post-landing upload contract. 
An earlier iteration of this ADR promoted the mock to a first-class component boundary peer of `satellite-provider`, with its own description under `components/` and its own deployable image — to make the contract auditable. -**Decision**: `mock-suite-sat-service` is documented as a testing-time component boundary with its own description in `components/`. Component decomposition (Step 3) treats it as a peer of `satellite-provider`, not a sub-implementation of C8. The mock implements the contract sketched in `_docs/_process_leftovers/2026-05-09_satellite-provider-design-tasks.md`. +**Decision (current)**: the mock is **an e2e-test fixture only**, scoped under `tests/fixtures/mock-suite-sat-service/`. The architectural counterparty for both the existing download path and the planned D-PROJ-2 upload path is the **real** `satellite-provider`. The contract sketch lives in `_docs/_process_leftovers/2026-05-09_satellite-provider-design-tasks.md` (the source of truth for the parent-suite work) and is mirrored in C11 Tile Manager's external API section (the onboard consumer's view). The mock implements that contract in tests; production never reaches it. + +**Why reversed**: promoting an e2e-test fixture to a component boundary inflated the architectural surface and risked the test fixture drifting away from the real contract once D-PROJ-2 lands. The contract sketch in the leftover file is sufficient as the auditable source of truth without a separate component spec. **Alternatives considered**: -1. Embed mock inside C8 test fixtures — rejected: hides the contract. -2. Wait for D-PROJ-2 service-side implementation — rejected: blocks the onboard cycle. +1. Keep ADR-007 as originally written — rejected: see "Why reversed". +2. Wait for D-PROJ-2 service-side implementation before any tests — rejected: blocks the onboard cycle. 
-**Consequences**: The mock is a release artifact (not a test-only fixture); it is the source of truth for the onboard side of the D-PROJ-2 contract until the parent suite catches up. When `satellite-provider` ships the real endpoint, the mock is retired and the contract is replayed against the real service. +**Consequences**: The mock continues to ship in the operator-tooling tarball's compose file as a test-time service, but it is no longer documented under `_docs/02_document/components/`. Test specs and CI references treat it as a fixture. When `satellite-provider` ships the real endpoint, the fixture is replaced by pointing tests at the real service; no architectural changes flow from that switch. ### ADR-008 — D-C8-2 source-set switch is `Selected with runtime gate` (Mode B Fact #111) @@ -533,22 +538,23 @@ This decision is made on **technical grounds only**. Component licenses (BSD/Apa msp2_inav_adapter.py qgc_telemetry_adapter.py tests/ - c10_cache_provisioning/ - interface.py # CacheProvisioner, ManifestVerifier - provisioner.py - tests/ - c11_post_landing_upload/ # SEPARATE BINARY — never linked into airborne image - interface.py - uploader.py - tests/ + c10_cache_provisioning/ + interface.py # CacheProvisioner, ManifestVerifier + provisioner.py + tests/ + c11_tilemanager/ # SEPARATE BINARY — never linked into airborne image + interface.py # TileDownloader, TileUploader (two interfaces in one component) + http_tile_downloader.py + http_tile_uploader.py + tests/ c13_fdr/ interface.py # FdrWriter file_fdr_writer.py tests/ composition/ - runtime_root.py # composition root: config -> concrete graph - upload_tool_root.py # composition root for the C11 operator-side tool - research_root.py # composition root for the research/dev binary + runtime_root.py # composition root: config -> concrete graph + tilemanager_root.py # composition root for the C11 operator-side tool (download + upload) + research_root.py # composition root for the research/dev binary ``` 3. 
**Constructor injection only.** Every component class declares its collaborators as **typed `__init__` arguments**, against the sibling's interface (not the concrete class). Example sketch: diff --git a/_docs/02_document/common-helpers/01_helper_imu_preintegrator.md b/_docs/02_document/common-helpers/01_helper_imu_preintegrator.md new file mode 100644 index 0000000..44f1366 --- /dev/null +++ b/_docs/02_document/common-helpers/01_helper_imu_preintegrator.md @@ -0,0 +1,32 @@ +# Common Helper — `ImuPreintegrator` + +## Purpose + +Shared GTSAM `CombinedImuFactor` preintegration buffer. Both C1 (VIO) and C5 (StateEstimator) consume the same FC IMU window and want the same preintegrated Δ-pose / Δ-velocity / Δ-bias quantities. Centralising the preintegrator avoids two inconsistent integrations of the same IMU stream. + +## Used By + +- C1 — Visual / Visual-Inertial Odometry. +- C5 — State Estimator. + +## Interface (sketch) + +``` +class ImuPreintegrator: + def reset_with_bias(bias: ImuBias) -> None + def integrate_sample(sample: ImuSample) -> None + def integrate_window(window: ImuWindow) -> None + def current_preintegration() -> CombinedImuFactor + def reset_for_new_keyframe() -> CombinedImuFactor # returns the closed factor; clears state +``` + +## Implementation Notes + +- Wraps GTSAM's `PreintegrationCombinedParams` + `PreintegratedCombinedMeasurements`. +- Holds the gyro/accel noise model from the camera-calibration artifact (which carries IMU-noise covariances per-deployment). +- Single-threaded by design — composition root binds one preintegrator instance to one writer thread. + +## Caveats + +- Bias drift is the responsibility of the consumers (C1 + C5) who call `reset_with_bias(...)` whenever their estimate of the IMU bias changes. +- The preintegrator does not own a clock — every `integrate_*` call requires a monotonic timestamp on the IMU sample. 
diff --git a/_docs/02_document/common-helpers/02_helper_se3_utils.md b/_docs/02_document/common-helpers/02_helper_se3_utils.md new file mode 100644 index 0000000..ea1d75b --- /dev/null +++ b/_docs/02_document/common-helpers/02_helper_se3_utils.md @@ -0,0 +1,30 @@ +# Common Helper — `SE3Utils` + +## Purpose + +SE(3) ↔ pose-matrix conversion and Lie-algebra exponential/logarithm. Used wherever a 4×4 transformation matrix needs to be converted to/from a 6-vector, or where Jacobians of SE(3) operations are needed for covariance recovery. + +## Used By + +- C1 — Visual / Visual-Inertial Odometry (relative pose updates). +- C4 — Pose Estimation (`solvePnPRansac` 4×4 → SE(3) for the GTSAM factor). +- C5 — State Estimator (iSAM2 graph keys + smoothed history). + +## Interface (sketch) + +``` +def matrix_to_se3(T_4x4: ndarray) -> SE3 +def se3_to_matrix(pose: SE3) -> ndarray +def exp_map(xi: Vector6) -> SE3 +def log_map(pose: SE3) -> Vector6 +def adjoint(pose: SE3) -> Matrix6 +``` + +## Implementation Notes + +- Backed by GTSAM `Pose3` + Eigen Lie-algebra primitives where available; otherwise pure numpy. +- Proper-rotation (determinant +1) precondition — the caller is responsible for orthogonalising input rotation matrices before calling `matrix_to_se3`. + +## Caveats + +- Library-grade Lie-algebra functions exist in `manifpy` and `pylie`; we use GTSAM's primitives directly to avoid pulling in a second math library. If a future strategy needs richer manifold ops, evaluate `manifpy` then. diff --git a/_docs/02_document/common-helpers/03_helper_lightglue_runtime.md b/_docs/02_document/common-helpers/03_helper_lightglue_runtime.md new file mode 100644 index 0000000..291612d --- /dev/null +++ b/_docs/02_document/common-helpers/03_helper_lightglue_runtime.md @@ -0,0 +1,30 @@ +# Common Helper — `LightGlueRuntime` + +## Purpose + +Shared LightGlue inference handle. 
C2.5 (Re-rank) does single-pair LightGlue matching for inlier counting on K=10 candidates per frame; C3 (CrossDomainMatcher) does the heavier matching pass on the surviving N=3 candidates. Both use the same LightGlue engine; sharing the engine avoids paying the engine-build / GPU-memory cost twice. + +## Used By + +- C2.5 — Inlier-based Re-rank. +- C3 — Cross-domain Matcher. + +## Interface (sketch) + +``` +class LightGlueRuntime: +    def __init__(engine_handle: EngineHandle): ... +    def match(features_a: KeypointSet, features_b: KeypointSet) -> CorrespondenceSet +    def match_batch(features_a_list, features_b_list) -> list[CorrespondenceSet] +    def descriptor_dim() -> int +``` + +## Implementation Notes + +- Owned by the composition root; the same instance is constructor-injected into both C2.5 and C3. +- Backed by C7's `InferenceRuntime.deserialize_engine(LIGHTGLUE_ENGINE_CACHE_ENTRY)` at takeoff. +- Single CUDA stream; concurrent calls forbidden — composition root binds the runtime to the single F3 hot-path thread. + +## Caveats + +- The features fed in MUST come from the same backbone that the LightGlue engine was trained for (DISK in production-default; ALIKED / XFeat in alternates). Mixing backbones is a runtime error caught by the matcher's input shape check. diff --git a/_docs/02_document/common-helpers/04_helper_wgs_converter.md b/_docs/02_document/common-helpers/04_helper_wgs_converter.md new file mode 100644 index 0000000..0d33f3f --- /dev/null +++ b/_docs/02_document/common-helpers/04_helper_wgs_converter.md @@ -0,0 +1,43 @@ +# Common Helper — `WgsConverter` + +## Purpose + +WGS84 ↔ local tangent-plane (ENU/NED) ↔ tile pixel-coordinate conversions. Required by every component that interacts with geographic positions — from C4's pose estimation, through C5's state graph, C6's tile-bounding-box queries, C8's per-FC encoding, and C10's bbox provisioning, to C12's operator UX. + +## Used By + +- C4 — Pose Estimation. 
+- C5 — State Estimator. +- C6 — Tile Cache + Spatial Index (bbox queries). +- C8 — FC Adapter (per-FC encoding of LatLonAlt → MAVLink/MSP2). +- C10 — Pre-flight Cache Provisioning (bbox → tile-id list). +- C12 — Operator Pre-flight Tooling (operator-entered bbox). + +## Interface (sketch) + +``` +class WgsConverter: + @staticmethod + def latlonalt_to_ecef(p: LatLonAlt) -> Vector3 + @staticmethod + def ecef_to_latlonalt(p: Vector3) -> LatLonAlt + @staticmethod + def latlonalt_to_local_enu(origin: LatLonAlt, p: LatLonAlt) -> Vector3 + @staticmethod + def local_enu_to_latlonalt(origin: LatLonAlt, p_enu: Vector3) -> LatLonAlt + @staticmethod + def latlon_to_tile_xy(zoom: int, lat: float, lon: float) -> tuple[int, int] + @staticmethod + def tile_xy_to_latlon_bounds(zoom: int, x: int, y: int) -> BoundingBox +``` + +## Implementation Notes + +- Stateless; pure functions. +- Backed by `pyproj` for the geodesy primitives; tile_xy math uses the standard slippy-map convention (matches `satellite-provider`'s on-disk layout). +- All conversions use WGS84 ellipsoid; no datum-shift complexity. + +## Caveats + +- The static-only design satisfies the coderule.mdc constraint ("only use static methods for pure self-contained computations"). If a future deployment needs alternative datum support, switch to an instance-based factory then. +- Tile-coordinate math is zoom-level-sensitive; callers MUST pass the right zoom level for the tile in question (typically zoomLevel from `TileMetadata`). diff --git a/_docs/02_document/common-helpers/05_helper_sha256_sidecar.md b/_docs/02_document/common-helpers/05_helper_sha256_sidecar.md new file mode 100644 index 0000000..99e0363 --- /dev/null +++ b/_docs/02_document/common-helpers/05_helper_sha256_sidecar.md @@ -0,0 +1,36 @@ +# Common Helper — `Sha256Sidecar` + +## Purpose + +Atomic-write + SHA-256 content-hash sidecar pattern (D-C10-3). 
Every persistent artifact that takeoff-load (F2) verifies must be written atomically AND have a `.sha256` sidecar whose digest the verifier can independently recompute. Centralising the pattern avoids multiple slightly-different implementations across C6 (FAISS index, tile metadata), C7 (engine + calibration cache), and C10 (Manifest itself). + +## Used By + +- C6 — Tile Cache + Spatial Index (FAISS `.index`, descriptor sidecar; tile pixels do NOT use sidecars individually — there are too many; the Manifest covers the tile-tree hash collectively). +- C7 — Inference Runtime (engine cache files + INT8 calibration cache; D-C10-6 calibration-cache trust depends on this). +- C10 — Pre-flight Cache Provisioning (Manifest itself; aggregate hash of the cache root). + +## Interface (sketch) + +``` +class Sha256Sidecar: +    @staticmethod +    def write_atomic(path: Path, payload: bytes) -> sha256 +    @staticmethod +    def write_atomic_and_sidecar(path: Path, payload: bytes) -> sha256 +    @staticmethod +    def verify(path: Path) -> bool                    # checks payload hash against sidecar +    @staticmethod +    def aggregate_hash(paths: list[Path]) -> sha256   # for Manifest covering many files +``` + +## Implementation Notes + +- Backed by the `atomicwrites` package for atomic rename and Python's `hashlib.sha256` for digesting. +- Sidecars are written as `.sha256` files containing the hex digest (no JSON wrapper — keeps verification trivial). +- `aggregate_hash` is order-deterministic (sorts paths first) so two runs that read the same files yield the same aggregate. + +## Caveats + +- The atomic rename is filesystem-level — works on POSIX local filesystems, not on NFS / SMB / overlayfs. For production deployments the cache root MUST live on a local filesystem. +- The sidecar is NOT cryptographically signed; it protects against accidental corruption + file-replacement-after-staging, NOT against an attacker with write access to the cache root. 
Threat model treats the operator workstation as trusted; the companion's write access is restricted to F4 (mid-flight tile gen) which has its own per-flight signing key path. diff --git a/_docs/02_document/common-helpers/06_helper_engine_filename_schema.md b/_docs/02_document/common-helpers/06_helper_engine_filename_schema.md new file mode 100644 index 0000000..75c83aa --- /dev/null +++ b/_docs/02_document/common-helpers/06_helper_engine_filename_schema.md @@ -0,0 +1,50 @@ +# Common Helper — `EngineFilenameSchema` + +## Purpose + +Self-describing `.engine` filename schema per D-C10-7. TensorRT engines are NOT portable across `(SM, JetPack, TRT, precision)` tuples; the filename schema makes mismatch instantly visible at takeoff load (F2) so refusing-to-deserialize-on-mismatch becomes trivial. + +## Used By + +- C7 — Inference Runtime (writes engines with this schema; reads them on `deserialize_engine`). +- C10 — Pre-flight Cache Provisioning (compiles engines via C7 with this schema; writes them to the cache root). + +## Interface (sketch) + +``` +class EngineFilenameSchema: + @staticmethod + def build(model_name: str, sm: int, jetpack: str, trt: str, precision: str) -> str + @staticmethod + def parse(filename: str) -> EngineCacheKey + @staticmethod + def matches_host(filename: str, host_capabilities: HostCapabilities) -> bool +``` + +Filename format: `{model}__sm{SM}_jp{JP_dotted}_trt{TRT_dotted}_{precision}.engine` + +Example: `ultravpr__sm87_jp6.2_trt10.3_fp16.engine` + +``` +EngineCacheKey: + model_name: string + sm: int (e.g. 87 for Jetson Orin Nano Super) + jetpack: string (e.g. "6.2") + trt: string (e.g. "10.3") + precision: enum {fp16, int8, mixed} + +HostCapabilities: + current_sm: int + current_jetpack: string + current_trt: string +``` + +## Implementation Notes + +- Stateless; pure string parsing. +- `matches_host` returns true iff every tuple element matches exactly. F2 takeoff load uses this to decide which engines to deserialize and which to refuse. 
+ +## Caveats + +- The dotted-version format must round-trip cleanly through filesystems (no `/` or `\` in dotted versions; safe). +- Adding a new tuple dimension (e.g., a per-binary `BUILD_*` flag combination) requires extending the schema AND every existing `.engine` filename. Versioning the schema itself is a Plan-phase carryforward if/when needed. diff --git a/_docs/02_document/common-helpers/07_helper_ransac_filter.md b/_docs/02_document/common-helpers/07_helper_ransac_filter.md new file mode 100644 index 0000000..d6fb993 --- /dev/null +++ b/_docs/02_document/common-helpers/07_helper_ransac_filter.md @@ -0,0 +1,49 @@ +# Common Helper — `RansacFilter` + +## Purpose + +Thin wrapper around OpenCV's RANSAC + reprojection-residual computation. Used by the cross-domain matcher (C3 — RANSAC over 2D-2D correspondences for the per-candidate inlier count), the conditional refiner (C3.5 — recompute residual after AdHoP refinement), and the pose estimator (C4 — RANSAC inside `solvePnPRansac` is OpenCV-internal, but C4 also computes per-frame final reprojection residual via this helper for FDR provenance). + +## Used By + +- C3 — Cross-domain Matcher. +- C3.5 — AdHoP-conditional Refiner. +- C4 — Pose Estimation. + +## Interface (sketch) + +``` +class RansacFilter: + @staticmethod + def filter_correspondences( + correspondences: ndarray[N, 4], + ransac_threshold_px: float, + min_inliers: int, + ) -> RansacResult + + @staticmethod + def compute_reprojection_residual( + correspondences: ndarray[I, 4], + K: ndarray[3, 3], + distortion: ndarray, + pose: SE3, + ) -> float # median residual in pixels +``` + +``` +RansacResult: + inlier_correspondences: ndarray[I, 4] + inlier_count: int + outlier_count: int + median_residual_px: float +``` + +## Implementation Notes + +- Backed by `cv2.findHomography(..., cv2.RANSAC)` for the 2D-2D case and `cv2.projectPoints(...)` for the post-pose residual. +- Stateless; pure function. 
+ +## Caveats + +- The RANSAC threshold is a tunable; defaults are documented per-component (C3, C3.5, C4) in their specs. +- For 2D-3D RANSAC inside C4's `solvePnPRansac`, OpenCV does it internally — this helper is for the standalone reprojection-residual computation that lives outside the PnP call. diff --git a/_docs/02_document/common-helpers/08_helper_descriptor_normaliser.md b/_docs/02_document/common-helpers/08_helper_descriptor_normaliser.md new file mode 100644 index 0000000..47329e5 --- /dev/null +++ b/_docs/02_document/common-helpers/08_helper_descriptor_normaliser.md @@ -0,0 +1,33 @@ +# Common Helper — `DescriptorNormaliser` + +## Purpose + +L2-normalise descriptors so cosine similarity aligns with Euclidean distance — required because FAISS HNSW operates on Euclidean / inner-product spaces but the upstream backbones (UltraVPR, MegaLoc, MixVPR, etc.) emit raw embeddings that are meant to be compared by cosine similarity. The same normalisation MUST be applied at both the **corpus** side (C10 during F1 provisioning) and the **query** side (C2 at runtime); centralising the helper guarantees they don't drift apart. + +## Used By + +- C2 — VPR (query side; per-frame embedding before FAISS lookup). +- C10 — Pre-flight Cache Provisioning (corpus side; per-tile embedding before FAISS index population). + +## Interface (sketch) + +``` +class DescriptorNormaliser: +    @staticmethod +    def l2_normalise(descriptor: ndarray[D]) -> ndarray[D] +    @staticmethod +    def l2_normalise_batch(descriptors: ndarray[N, D]) -> ndarray[N, D] +    @staticmethod +    def descriptor_metric() -> str   # always "inner_product" — for FAISS index config +``` + +## Implementation Notes + +- Stateless; pure function. +- Backed by numpy / numpy-CUDA depending on the runtime; preserves dtype (fp16 in → fp16 out, fp32 in → fp32 out). +- The value returned by the static `descriptor_metric()` method is the source of truth for the FAISS HNSW index distance metric — both C6's `DescriptorIndex.search_topk` and C10's index-build code consult it. 
+ +## Caveats + +- Zero-norm vectors are returned as the zero vector (no division-by-zero); callers must filter or accept that such descriptors will match nothing. +- The choice of "inner product on L2-normalised" rather than "cosine" is FAISS-idiomatic — FAISS does not have a built-in cosine metric; cosine is achieved by L2-normalising and using inner product. diff --git a/_docs/02_document/components/01_c1_vio/description.md b/_docs/02_document/components/01_c1_vio/description.md new file mode 100644 index 0000000..69f1eaf --- /dev/null +++ b/_docs/02_document/components/01_c1_vio/description.md @@ -0,0 +1,136 @@ +# C1 — Visual / Visual-Inertial Odometry + +## 1. High-Level Overview + +**Purpose**: produce a per-frame relative pose SE(3) + 6×6 covariance + IMU bias estimate + feature-quality summary from the nav-camera frame and the FC IMU/attitude window, fusing visual and inertial cues without any external (satellite) reference. + +**Architectural Pattern**: Strategy — `VioStrategy` interface with three concrete implementations (Okvis2 production-default, VinsMono research-only, KltRansac mandatory simple-baseline), constructor-injected at the composition root (ADR-009), build-time gated by per-implementation CMake `BUILD_*` flags (ADR-002), runtime selection by config at startup (ADR-001), not hot-swappable mid-flight. + +**Upstream dependencies**: +- Camera ingest thread → `NavCameraFrame` (3 Hz nominal, drop-oldest queue). +- C8 FC adapter inbound side → `ImuWindow` (100–200 Hz, time-aligned to frame timestamp). +- Camera calibration artifact (loaded once at startup; passed in via constructor). + +**Downstream consumers**: +- C5 StateEstimator (consumes `VioOutput` for the iSAM2 `BetweenFactorPose3` + IMU bias prior). +- F8 Companion-reboot recovery (uses last `VioOutput` as warm-start hint when re-entering the per-frame loop). + +## 2. 
Internal Interfaces + +### Interface: `VioStrategy` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `process_frame` | `NavCameraFrame, ImuWindow, CameraCalibration` | `VioOutput` | No (called on the camera ingest hot path) | `VioInitializingError`, `VioDegradedError`, `VioFatalError` | +| `reset_to_warm_start` | `WarmStartPose` | `None` | No | `VioFatalError` | +| `health_snapshot` | `()` | `VioHealth` | No | — | + +**Input DTOs**: +``` +NavCameraFrame: + frame_id: uuid (required) — monotonic per-flight + capture_timestamp: monotonic_ns (required) — companion clock; FC clock cross-sync via C8 + pixels: ndarray[H=3648, W=5472, C=3, dtype=uint8] (required) — 3 Hz nominal + camera_id: string (required) — matches the loaded calibration artifact + +ImuWindow: + start_t_ns: monotonic_ns (required) + end_t_ns: monotonic_ns (required) — should bracket the frame timestamp + samples: list[ImuSample] (required) — accel + gyro at 100–200 Hz + +WarmStartPose: + body_T_world: SE3 (required) — initial pose hint, e.g. from F2 takeoff load (AC-5.1) + velocity_b: Vector3 (required, m/s) + bias: ImuBias (required) — accel + gyro bias seed +``` + +**Output DTOs**: +``` +VioOutput: + frame_id: uuid — echoes input + relative_pose_T: SE3 — body-frame motion since last keyframe + pose_covariance_6x6: Matrix6 — honest ESKF or factor-graph covariance per concrete strategy + imu_bias: ImuBias — current accel + gyro bias estimate + feature_quality: FeatureQuality — tracked/lost feature counts, mean parallax, MRE + emitted_at: monotonic_ns + +VioHealth: + state: enum {INIT, TRACKING, DEGRADED, LOST} + consecutive_lost: int + bias_norm: float — used by C5 quality_metadata + AC-NEW-8 spoof gate +``` + +## 3. External API Specification + +Not applicable — internal-only component, no HTTP/gRPC surface. + +## 4. Data Access Patterns + +Stateless w.r.t. persistent storage. 
Each strategy holds **in-memory** state only: +- Sliding window of N keyframes (concrete strategy decides N). +- IMU bias and velocity state. +- Feature track buffer. + +No database access, no cache layer beyond the in-process keyframe window. + +## 5. Implementation Details + +**Algorithmic Complexity**: per-frame cost is dominated by feature extraction + matching; `O(F)` in feature count for KltRansac, `O(F·log K)` for Okvis2 sliding-window optimisation across K keyframes (D-C5-3 sets K=10–20). + +**State Management**: per-instance in-memory (window of keyframes, IMU bias, velocity). The strategy lives for the duration of a flight; reset on `reset_to_warm_start` for F8 reboot recovery. + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| OKVIS2 (C++) | upstream HEAD pinned per Plan-phase | Production-default tightly-coupled VIO; BSD-3-Clause | +| VINS-Mono (C++) | upstream HEAD pinned per Plan-phase | Research-only loosely-coupled VIO for IT-12 comparative study; behind `BUILD_VINS_MONO` | +| OpenCV | ≥ 4.12.0 (CVE-2025-53644 mitigation) | KLT pyramidal optical flow + RANSAC for the simple-baseline strategy | +| Eigen | matches OKVIS2 / GTSAM pin | Lie-algebra math for SE(3) + 6×6 covariance | +| pybind11 | matches OKVIS2 / VINS-Mono build | Python bindings for the C++ strategies | + +**Error Handling Strategy**: +- `VioInitializingError`: state = INIT, no `VioOutput` emitted, C5 falls back to FC IMU prior — no MAVLink emission. +- `VioDegradedError`: state = DEGRADED, `VioOutput` emitted with inflated covariance, C5 down-weights. +- `VioFatalError`: state = LOST after configurable consecutive frames; AC-5.2 fallback path triggered (FC IMU-only after 3 s). +- No retries inside `process_frame` — the caller is responsible for handling drop-oldest queue semantics on the hot path. + +## 6. 
Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `ImuPreintegrator` | shared GTSAM `CombinedImuFactor` preintegration buffer | C1, C5 (both consume the same IMU window) | +| `SE3Utils` | SE(3) ↔ pose-matrix conversion, Lie-algebra exponential/logarithm | C1, C4, C5 | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- Pure VIO drifts unbounded over time; AC-1.3 cumulative-drift bound (<100 m visual / <50 m IMU-fused between satellite anchors) is met *only* in cooperation with C2/C3/C4 anchors, not by C1 alone. +- Sharp turns with <5% frame overlap (RESTRICT-UAV-3) cause feature-track loss in all three strategies; F6 satellite re-localization is the recovery path. + +**Potential race conditions**: +- The camera ingest thread is the sole producer; C5 is the sole consumer. Concurrent calls to `process_frame` on a single strategy instance are forbidden — enforce in the composition root by binding one strategy instance to the camera ingest thread. + +**Performance bottlenecks**: +- Okvis2 sliding-window optimisation can spike to 80–120 ms on a thermally-throttled Jetson; D-CROSS-LATENCY-1 hybrid auto-degrades C4 covariance recovery (not C1) to free budget. + +## 8. Dependency Graph + +**Must be implemented after**: C7 (TRT/ONNX runtime is not on C1's path, but `ImuPreintegrator` shares GTSAM with C5 and is built alongside C5), C13 (FDR sink for VioHealth telemetry). + +**Can be implemented in parallel with**: C2, C3, C6 — independent code paths. + +**Blocks**: C5 (no fusion without `VioOutput`), C4 (no per-frame relative-pose prior), F3 / F5 / F6 (every per-frame flow consumes `VioOutput`). + +## 9. 
Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `VioFatalError` raised; AC-5.2 path imminent | `VIO LOST after 9 consecutive frames; strategy=okvis2` | +| WARN | `VioDegradedError`; covariance inflation > 2× steady-state | `VIO degraded: parallax=0.02, mre=4.1px, bias_norm=0.18` | +| INFO | Strategy init complete; warm-start applied; state transitions | `VIO ready: strategy=okvis2, calibration=adti20.unit-7` | +| DEBUG | Per-frame keyframe decision, feature-track count | `VIO frame=12345 tracked=187 new=42 keyframe=true` | + +**Log format**: structured JSON via the project's shared logger; no plaintext. + +**Log storage**: stdout (Tier-1) / journald (Tier-2 dev) / FDR via C13 (production). Per-frame DEBUG logs are never persisted to FDR — they go to stdout/journald only. diff --git a/_docs/02_document/components/01_c1_vio/tests.md b/_docs/02_document/components/01_c1_vio/tests.md new file mode 100644 index 0000000..702017b --- /dev/null +++ b/_docs/02_document/components/01_c1_vio/tests.md @@ -0,0 +1,176 @@ +# Test Specification — C1 Visual / Visual-Inertial Odometry + +This file is component-scoped. AC-level coverage is in the suite-level test docs (`_docs/02_document/tests/*.md`) and the canonical traceability map is `_docs/02_document/tests/traceability-matrix.md`. The tables below cite test IDs from those files; the `Component-Internal Tests` section adds C1-specific unit/contract tests that the suite-level scenarios do not cover. 
+ +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs (suite-level + this file) | Coverage | +|-------|---------------------------------|-----------------------------------|----------| +| AC-1.3 | Cumulative drift between satellite-anchored fixes <100 m visual / <50 m IMU-fused | FT-P-02, **C1-IT-01** | Covered | +| AC-1.4 | Estimate reports 95% covariance + source label | FT-P-03, **C1-IT-02** | Covered | +| AC-2.1a | Frame-to-frame registration ≥95% on normal segments | FT-P-04, **C1-IT-03** | Covered | +| AC-2.2 (frame-to-frame portion) | MRE <1 px frame-to-frame | FT-P-05, **C1-IT-04** | Covered | +| AC-3.2 | Tolerate sharp turns; recovery via satellite re-loc | FT-P-07, FT-N-02 | Covered (C1 contributes track-loss detection) | +| AC-4.1 | E2E latency <400 ms p95 | NFT-PERF-01 (Tier-2), **C1-PT-01** | Covered | +| AC-5.1 | Init from FC EKF's last valid GPS + IMU-extrapolated | FT-P-11, **C1-IT-05** | Covered | +| AC-5.3 | On reboot, re-init from FC IMU-extrapolated pose | NFT-RES-02, **C1-IT-06** | Covered | + +--- + +## Component-Internal Tests + +### C1-IT-01: VioStrategy contract — `process_frame` honest covariance under degradation + +**Summary**: every concrete `VioStrategy` implementation (Okvis2, VinsMono, KltRansac) must produce a 6×6 covariance whose norm grows monotonically when feature tracking degrades. + +**Traces to**: AC-1.3, AC-1.4 + +**Description**: feed each strategy a synthetic 60 s nav-camera + IMU sequence with a controlled feature-loss event at t=30 s (50% feature drop). Assert that `pose_covariance_6x6` Frobenius norm before t=30 s is below the steady-state threshold and rises monotonically for ≥3 s after the event. No strategy may emit a tightened covariance during a degradation event (catches honest-covariance-violation regressions). 
+ +**Input data**: `tests/fixtures/synthetic_vio/normal_then_feature_drop_60s/` (nav-cam frames at 3 Hz + IMU at 200 Hz; feature-loss event injected via image masking). + +**Expected result**: `||cov||_F` curve has a rising shoulder ≥1.5× steady-state norm within 3 s of the event for all three strategies; `VioHealth.state` transitions `TRACKING → DEGRADED` within the same window. + +**Max execution time**: 30 s per strategy on Tier-1. +**Dependencies**: helper `ImuPreintegrator` (shared with C5). + +--- + +### C1-IT-02: `VioOutput` schema invariants + +**Summary**: every `VioOutput` carries a 6×6 SPD covariance and a non-empty `feature_quality`. + +**Traces to**: AC-1.4 + +**Description**: drive each strategy through 100 frames of a synthetic loop; for each emitted `VioOutput`, assert (a) `pose_covariance_6x6` is symmetric and positive-definite, (b) `feature_quality.tracked + new ≥ 0`, (c) `frame_id` matches the input `NavCameraFrame.frame_id`. Any single violation fails the test. + +**Input data**: `tests/fixtures/synthetic_vio/loop_100f/`. + +**Expected result**: 100/100 frames pass all invariants per strategy. + +**Max execution time**: 10 s. + +--- + +### C1-IT-03: KltRansac mandatory simple-baseline registration ≥95% + +**Summary**: the simple-baseline strategy must hit AC-2.1a's 95% threshold on the Derkachi normal segment so the engine rule's mandatory baseline is met. + +**Traces to**: AC-2.1a (engine rule) + +**Description**: replay the Derkachi normal-segment fixture (the same 60 stills C8 fixture that FT-P-04 uses) through the KltRansac strategy only; count frames with `VioHealth.state == TRACKING` at emission. Pass if ≥95%. + +**Input data**: `tests/fixtures/flight_derkachi/normal_segment_60_stills/`. + +**Expected result**: tracked frame ratio ≥ 0.95. + +**Max execution time**: 60 s. + +--- + +### C1-IT-04: Frame-to-frame MRE bound + +**Summary**: each strategy's per-frame mean reprojection error stays under 1 px on normal segments. 
+ +**Traces to**: AC-2.2 (frame-to-frame portion) + +**Description**: same Derkachi normal-segment fixture as C1-IT-03. Compute MRE per frame from the strategy's internal residual; assert MRE p95 < 1 px per AC-2.2. + +**Input data**: as above. + +**Expected result**: MRE p95 < 1 px for Okvis2 + KltRansac (production-default + simple-baseline). VinsMono is research-only and exempt from MRE bound (only IT-12 comparative-study coverage). + +**Max execution time**: 60 s. + +--- + +### C1-IT-05: Warm-start from `WarmStartPose` converges within configured budget + +**Summary**: `reset_to_warm_start` followed by 5 frames of input must converge to `state == TRACKING` for every strategy. + +**Traces to**: AC-5.1 + +**Description**: prepare a `WarmStartPose` derived from the FC EKF's last-valid-GPS fixture; call `reset_to_warm_start`, then push 5 frames of normal-segment input; assert health transitions `INIT → TRACKING` within 5 frames. + +**Input data**: `tests/fixtures/flight_derkachi/takeoff_warmstart/`. + +**Expected result**: TRACKING by frame 5 for all three strategies. + +**Max execution time**: 5 s. + +--- + +### C1-IT-06: F8 reboot recovery via warm-start hint + +**Summary**: simulate a mid-flight reboot — the strategy must re-init from the warm-start hint without crashing or producing covariance < pre-reboot. + +**Traces to**: AC-5.3 + +**Description**: run normal-segment input for 30 s; capture last `VioOutput`; reset the strategy with that pose as `WarmStartPose`; resume input; assert next 5 emitted `VioOutput` have `pose_covariance_6x6` Frobenius norm ≥ the pre-reboot value (no fake confidence after reboot). + +**Input data**: as C1-IT-03. + +**Expected result**: pass per assertion above for all three strategies. + +**Max execution time**: 60 s. + +--- + +## Performance Tests + +### C1-PT-01: per-frame latency budget on Tier-2 + +**Summary**: `process_frame` p95 latency on Jetson under nominal thermal. 
+ +**Traces to**: AC-4.1 (component-level partition; suite-level NFT-PERF-01 owns the e2e budget). + +**Load scenario**: +- Single ingest thread, 3 Hz frame rate, 10 min replay of Derkachi normal segment. +- Concurrent C2 backbone forward pass running on the same Jetson (realistic load contention). + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `process_frame` latency p50 | ≤ 25 ms (Okvis2 production-default) | 60 ms | +| `process_frame` latency p95 | ≤ 80 ms | 120 ms | +| Throughput | ≥ 3 Hz sustained | < 2.5 Hz | + +**Resource limits**: +- CPU: ≤ 30% of one core (Okvis2 is multi-threaded internally; bound at 30% per ADR-002 budget partition). +- Memory: ≤ 1.5 GB resident. + +--- + +## Security Tests + +C1 has no externally-reachable surface (internal-only component); suite-level NFT-SEC-02 (no in-flight egress) and NFT-SEC-05 (DNS blackholing) cover the airborne process broadly. No C1-specific security tests. + +--- + +## Acceptance Tests + +C1 contributes to AC-1.3 / 1.4 / 2.1a / 5.1 / 5.3 via the suite-level FT scenarios cited in the traceability table. No additional C1-only acceptance tests are needed. + +--- + +## Test Data Management + +**Required test data**: + +| Data Set | Description | Source | Size | +|----------|-------------|--------|------| +| `synthetic_vio/normal_then_feature_drop_60s/` | 60 s nav-cam + IMU with controlled feature-loss event at t=30 s | generated by `scripts/gen_synthetic_vio.py` (deterministic seed) | ~50 MB | +| `synthetic_vio/loop_100f/` | 100-frame synthetic closed loop for invariant checks | generated, deterministic | ~30 MB | +| `flight_derkachi/normal_segment_60_stills/` | the project's canonical normal-segment fixture | curated subset of Derkachi raw drop | ~80 MB | +| `flight_derkachi/takeoff_warmstart/` | last-valid-GPS + IMU window from FC EKF for warm-start tests | recorded once, replayed | ~5 MB | + +**Setup procedure**: +1. 
Run `scripts/gen_synthetic_vio.py` once per fixture to populate `tests/fixtures/synthetic_vio/`. +2. Mount `tests/fixtures/flight_derkachi/` from the project's data archive (read-only). + +**Teardown procedure**: +1. Synthetic fixtures persist between runs (deterministic; no per-run mutation). +2. The Derkachi fixture is read-only; nothing to clean up. + +**Data isolation strategy**: every test runs in its own temp directory under `tests/tmp/c1/{test_id}/`; per-strategy state is constructed fresh in each test (no shared state across tests). diff --git a/_docs/02_document/components/02_c2_vpr/description.md b/_docs/02_document/components/02_c2_vpr/description.md new file mode 100644 index 0000000..ce52d28 --- /dev/null +++ b/_docs/02_document/components/02_c2_vpr/description.md @@ -0,0 +1,138 @@ +# C2 — Visual Place Recognition + +## 1. High-Level Overview + +**Purpose**: given the current `NavCameraFrame`, retrieve the top-K=10 candidate satellite tiles from the pre-cached corpus by descriptor similarity. C2 owns the *retrieval* step; C2.5 narrows K=10 → N=3 via inlier-based re-rank. + +**Architectural Pattern**: Strategy — `VprStrategy` interface; concrete implementations (UltraVPR primary, MegaLoc secondary, MixVPR / SelaVPR / EigenPlaces / NetVLAD / SALAD additional candidates) selected at startup by config (ADR-001); build-time gated per-implementation by `BUILD_*` flags (ADR-002); composition-root wired (ADR-009). + +**Upstream dependencies**: +- Camera ingest thread → `NavCameraFrame` (parallel fan-out with C1; same frame, distinct queue depth). +- C7 InferenceRuntime → backbone forward pass (TRT/ONNX/PyTorch per active runtime). +- C6 DescriptorIndex → FAISS HNSW lookup over pre-cached tile descriptors. +- Camera calibration artifact — for backbone input preprocessing (resize/crop/normalise). + +**Downstream consumers**: +- C2.5 ReRanker (consumes `VprResult`). + +## 2. 
Internal Interfaces + +### Interface: `VprStrategy` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `embed_query` | `NavCameraFrame, CameraCalibration` | `VprQuery` | No | `VprBackboneError` | +| `retrieve_topk` | `VprQuery, k: int` | `VprResult` | No | `IndexUnavailableError`, `VprBackboneError` | +| `descriptor_dim` | `()` | `int` | No | — | + +**Input DTOs**: +``` +NavCameraFrame: see C1 spec — same DTO + +VprQuery: + frame_id: uuid (required) + embedding: ndarray[D, dtype=float16|float32] (required) — D depends on backbone + produced_at: monotonic_ns +``` + +**Output DTOs**: +``` +VprResult: + frame_id: uuid + candidates: list[VprCandidate] (length = k, ranked by descriptor distance ascending) + retrieved_at: monotonic_ns + backbone_label: string — for FDR provenance + +VprCandidate: + tile_id: composite (zoomLevel, lat, lon) + descriptor_distance: float — backbone-specific metric (cosine for L2-normalised embeddings) + descriptor_dim: int +``` + +## 3. External API Specification + +Not applicable — internal-only component. + +## 4. Data Access Patterns + +### Queries + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| FAISS HNSW top-K=10 search | 3 Hz (per nav frame) | Yes | Yes — pre-built HNSW (C6) | + +### Caching Strategy + +| Data | Cache Type | TTL | Invalidation | +|------|-----------|-----|-------------| +| Backbone weights | TRT engine on disk + GPU resident | flight lifetime | Manifest content-hash gate (D-C10-3) at takeoff | +| FAISS HNSW index | mmap (C6 owns the file) | flight lifetime | Same as above | + +### Storage Estimates + +C2 itself stores no persistent data; it consumes C6's descriptor index. Sizing belongs in C6. + +### Data Management + +C2 is read-only against C6 during F3/F4/F6. 
Pre-flight, F1 triggers C10 (after C11 `TileDownloader` has populated C6) to call `embed_query` on every staged tile to populate the descriptor matrix consumed by C6. + +## 5. Implementation Details + +**Algorithmic Complexity**: HNSW search is `O(log N)` in corpus size for k=10; backbone forward pass is `O(1)` per frame (GPU-bound). + +**State Management**: stateless per-frame; the only persistent state is the loaded backbone weights and the FAISS index pointer (held by C6 and passed in via constructor). + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| FAISS (Python + C++) | upstream HEAD pinned per Plan-phase | HNSW retrieval; consumed via C6 | +| TensorRT | 10.3 (JetPack 6.2 pin) | Primary inference backend; consumed via C7 | +| ONNX Runtime + TRT EP | matches C7 | Fallback backend | +| PyTorch | matches simple-baseline track | FP16 baseline (NetVLAD / MixVPR mandatory) | +| UltraVPR (research code drop) | upstream HEAD pinned per Plan-phase | Documentary Lead PRIMARY backbone | +| MegaLoc, MixVPR, SelaVPR, EigenPlaces, NetVLAD | upstream HEAD pinned per Plan-phase | Secondary + mandatory simple-baselines | + +**Error Handling Strategy**: +- `VprBackboneError`: backbone forward pass failed (CUDA OOM, TRT engine deserialize mismatch). C2 emits no `VprResult`; C5 falls back to VIO-only with provenance label `visual_propagated` (AC-1.4). +- `IndexUnavailableError`: FAISS index handle invalid (e.g., post-F8 reboot before warm-up). Same fallback as above; F8 recovery flow re-mmaps the index. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `BackbonePreprocessor` | resize / crop / normalise per backbone's input contract | C2 only — keep inside the component, not a shared helper | +| `DescriptorNormaliser` | L2-normalise descriptors so cosine similarity aligns with Euclidean | C2 (query side), C10 (corpus side at cache artifact build) | + +## 7. 
Caveats & Edge Cases + +**Known limitations**: +- VPR is sensitive to scene change between cache build and flight time — AC-NEW-6 freshness gating is the project-level mitigation, not a C2 concern. +- Backbone choice is constrained by ADR-002: only the linked-in implementations are selectable at runtime. + +**Potential race conditions**: +- Concurrent `embed_query` calls on a single strategy instance can race on the GPU stream. Bind one strategy instance to one ingest thread — composition root enforces. + +**Performance bottlenecks**: +- Backbone forward pass is the dominant cost (~30–80 ms on Jetson per backbone). FAISS HNSW search is sub-millisecond for 100k-tile corpora. +- D-CROSS-LATENCY-1 hybrid does not change C2 behaviour — C2's budget is fixed; the auto-degrade happens at C4. + +## 8. Dependency Graph + +**Must be implemented after**: C6 (descriptor index), C7 (inference runtime), C10 (descriptor population at cache artifact build). + +**Can be implemented in parallel with**: C1, C8 — independent paths. + +**Blocks**: C2.5 (no candidates without `VprResult`), F3 / F6. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `VprBackboneError` or `IndexUnavailableError` | `VPR backbone OOM: backbone=ultravpr, frame=12345` | +| WARN | top-1 distance exceeds drift threshold (potential false-positive retrieval) | `VPR top-1 distance 0.42 above warn threshold 0.30; backbone=ultravpr` | +| INFO | Strategy ready; backbone loaded | `VPR ready: backbone=ultravpr, dim=512, corpus_size=87654` | +| DEBUG | Per-frame top-K distances | `VPR frame=12345 top10_distances=[0.12, 0.14, ...]` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN only). 
diff --git a/_docs/02_document/components/02_c2_vpr/tests.md b/_docs/02_document/components/02_c2_vpr/tests.md new file mode 100644 index 0000000..538b786 --- /dev/null +++ b/_docs/02_document/components/02_c2_vpr/tests.md @@ -0,0 +1,144 @@ +# Test Specification — C2 Visual Place Recognition + +Component-scoped. Suite-level coverage is in `_docs/02_document/tests/*.md`; canonical traceability is `tests/traceability-matrix.md`. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-2.1b | Satellite-anchor registration meets AC-1.1/1.2/2.2/8.2/8.6 | FT-P-05, FT-P-19, **C2-IT-01** | Covered | +| AC-2.2 (cross-domain portion) | MRE <2.5 px cross-domain | FT-P-06, **C2-IT-02** (recall floor only — MRE owned by C3) | Covered | +| AC-4.1 | E2E latency <400 ms p95 | NFT-PERF-01, **C2-PT-01** | Covered | +| AC-NEW-7 | Cache poisoning safety budget | NFT-SEC-01 (onboard-side), **C2-IT-03** | Covered (relaxed) | +| AC-8.6 (scale-ratio portion) | Scale-ratio satellite relocalization | FT-P-19, **C2-IT-04** | Covered | + +--- + +## Component-Internal Tests + +### C2-IT-01: recall@10 (top-K=10) on Derkachi + +**Summary**: the primary backbone (UltraVPR) achieves recall@10 ≥ 0.95 on the Derkachi normal segment against the pre-built corpus. + +**Traces to**: AC-2.1b + +**Description**: for each query frame in `flight_derkachi/normal_segment_60_stills/`, embed via UltraVPR; query the FAISS HNSW index; assert that the ground-truth tile (per recorded GPS + sector classification) is within the top-10 candidates for ≥95% of frames. + +**Input data**: `flight_derkachi/normal_segment_60_stills/` + the Derkachi corpus FAISS index built by C10 (consumed read-only here). + +**Expected result**: recall@10 ≥ 0.95 for UltraVPR; recall@10 ≥ 0.85 for the mandatory simple-baseline NetVLAD (engine rule check). + +**Max execution time**: 90 s. 
+ +--- + +### C2-IT-02: VprResult schema invariants + +**Summary**: every `VprResult` carries `len(candidates) == k`, monotonically-non-decreasing `descriptor_distance`, and a non-empty `backbone_label`. + +**Traces to**: AC-2.2 (downstream-coverage prerequisite) + +**Description**: 100 frames through `retrieve_topk(k=10)`; assert (a) length, (b) sorted-ascending distance, (c) `backbone_label` non-empty. + +**Input data**: `tests/fixtures/synthetic_vpr/diverse_100f/`. + +**Expected result**: 100/100 results pass all invariants. + +**Max execution time**: 10 s. + +--- + +### C2-IT-03: cache-poisoning seed rejection at retrieval + +**Summary**: when the corpus contains a poisoned tile injected during NFT-SEC-01 setup, the top-1 distance to that tile is bounded so downstream RANSAC + voting can reject it within the AC-NEW-7 relaxed budget. + +**Traces to**: AC-NEW-7 (component-level partition) + +**Description**: load the NFT-SEC-01 setup corpus (3-flight cumulative dataset with deflated covariance × 1.5–3); query each Derkachi frame; record the top-1 candidate for each. Pass if the fraction of frames in which the poisoned tile is the top-1 candidate stays below the relaxed-CI threshold (the relaxation per AC-text 2026-05-09). + +**Input data**: NFT-SEC-01 setup corpus. + +**Expected result**: poisoned-tile top-1 rate within AC-NEW-7 relaxed CI. + +**Max execution time**: 5 min. + +--- + +### C2-IT-04: scale-ratio invariance on satellite re-loc + +**Summary**: when the nav-camera scale changes (e.g., altitude shift), VPR top-K still surfaces the correct tile. + +**Traces to**: AC-8.6 (scale-ratio half) + +**Description**: synthetically scale the Derkachi normal-segment frames by ±20% (mimicking altitude variation); assert recall@10 stays ≥ 0.85 across the scale sweep. + +**Input data**: `flight_derkachi/scaled_+20%/` and `flight_derkachi/scaled_-20%/` (generated via deterministic resize). + +**Expected result**: recall@10 ≥ 0.85 at both scale extremes for UltraVPR. + +**Max execution time**: 90 s. 
+ +--- + +## Performance Tests + +### C2-PT-01: backbone forward + HNSW lookup budget on Tier-2 + +**Traces to**: AC-4.1 + +**Load scenario**: 3 Hz frame rate, 10 min replay; corpus size 87 654 tiles (Derkachi area at 0.5 m/px). + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `embed_query` p95 | ≤ 60 ms (UltraVPR / FP16) | 100 ms | +| `retrieve_topk(k=10)` p95 | ≤ 2 ms (HNSW) | 10 ms | +| Combined p95 | ≤ 65 ms | 110 ms | + +**Resource limits**: +- GPU memory: ≤ 600 MB resident for backbone weights. +- System memory: ≤ 200 MB for the mmap'd FAISS index handle. + +--- + +## Security Tests + +### C2-ST-01: index-handle invalidation safety + +**Summary**: if the FAISS index is rebuilt while C2 still holds a handle to it (a post-takeoff rebuild is FORBIDDEN; this is a unit-level safety check), the previous handle held by C2 must not silently return stale results. + +**Traces to**: defensive — no AC trace; backstops a code-injection / config-drift mode that AC-NEW-7 already covers at the suite level. + +**Test procedure**: +1. Build a tiny 100-tile FAISS index; mmap it through C2. +2. Replace the underlying file (simulating an out-of-band rebuild, which is FORBIDDEN at flight time per D-C10-3 but defended in depth here). +3. Call `retrieve_topk` and assert C2 raises `IndexUnavailableError` rather than returning stale candidates. + +**Pass criteria**: `IndexUnavailableError` raised; no candidates returned. +**Fail criteria**: any candidate returned. + +--- + +## Acceptance Tests + +C2 contributes to AC-2.1b / AC-8.6 via the suite-level FT scenarios. No additional C2-only acceptance tests. 
+ +--- + +## Test Data Management + +| Data Set | Description | Source | Size | +|----------|-------------|--------|------| +| `synthetic_vpr/diverse_100f/` | 100 diverse synthetic frames for invariant checks | generated, deterministic | ~50 MB | +| `flight_derkachi/normal_segment_60_stills/` | shared with C1 | curated | shared | +| `flight_derkachi/scaled_±20%/` | scale-ratio sweep | generated from above | ~40 MB | +| Derkachi FAISS corpus | C10's output, consumed read-only | C10 build artifact | ~200 MB | + +**Setup procedure**: +1. C10 must have built the Derkachi FAISS index (this is a Step-5-test-side prereq; in the test runner, the corpus is staged from `tests/fixtures/cache_artifacts/`). +2. Synthetic + scaled fixtures generated by deterministic scripts. + +**Teardown**: corpus is read-only; nothing to clean up. + +**Data isolation**: each test gets its own `tests/tmp/c2/{test_id}/`. diff --git a/_docs/02_document/components/03_c2_5_rerank/description.md b/_docs/02_document/components/03_c2_5_rerank/description.md new file mode 100644 index 0000000..5627a96 --- /dev/null +++ b/_docs/02_document/components/03_c2_5_rerank/description.md @@ -0,0 +1,111 @@ +# C2.5 — Inlier-based Re-rank + +## 1. High-Level Overview + +**Purpose**: re-rank C2's top-K=10 VPR candidates down to top-N=3 by single-pair LightGlue inlier count, producing a higher-precision input for the cross-domain matcher (C3). The re-rank step is the architectural boundary between cheap descriptor retrieval (C2) and expensive cross-domain matching (C3) — it pays a small extra cost so C3 only operates on the most promising candidates. + +**Architectural Pattern**: Strategy (single concrete implementation today: `InlierCountReRanker`). Future re-rank algorithms can be added as additional `ReRankStrategy` implementations behind the same interface. + +**Upstream dependencies**: +- C2 → `VprResult` (top-K=10 candidates). 
+- Shared `LightGlueRuntime` helper (used in single-pair mode for inlier counting; the same matcher object is shared with C3 — owned by the helper, not by C3, so neither component depends on the other at build time). +- C6 TileStore → fetch tile pixels for each candidate (cheap, in-memory page-cache hit during a flight). +- Camera calibration artifact — for nav-frame preprocessing. + +**Downstream consumers**: +- C3 CrossDomainMatcher (consumes `RerankResult`). + +## 2. Internal Interfaces + +### Interface: `ReRankStrategy` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `rerank` | `NavCameraFrame, VprResult, n: int` | `RerankResult` | No | `RerankBackboneError`, `TileFetchError` | + +**Input DTOs**: +``` +NavCameraFrame: see C1 +VprResult: see C2 +``` + +**Output DTOs**: +``` +RerankResult: + frame_id: uuid + candidates: list[RerankCandidate] (length = n=3, ranked by inlier_count descending) + reranked_at: monotonic_ns + +RerankCandidate: + tile_id: composite (zoomLevel, lat, lon) + inlier_count: int — single-pair LightGlue inliers + descriptor_distance: float — carried forward from C2 for FDR provenance + tile_pixels_handle: Tile pixel reference (do not copy — page-cache hit) +``` + +## 3. External API Specification + +Not applicable. + +## 4. Data Access Patterns + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| Tile pixel fetch from C6 (10 tiles per frame) | 3 Hz × 10 = 30 Hz | Yes | tile filesystem already mmap-backed in C6 | + +No caching layer beyond C6's mmap. The same tile may be fetched repeatedly across frames; OS page cache absorbs that cost. + +## 5. Implementation Details + +**Algorithmic Complexity**: `O(K)` LightGlue forward passes per frame (K=10), each `O(M_tile · M_query)` in feature counts. The whole step is GPU-bound on the same engine that C3 uses — hence the shared LightGlue runtime. + +**State Management**: stateless per-frame. 
Holds a reference to the shared LightGlue object owned by the `LightGlueRuntime` helper (the same object C3 uses). + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| LightGlue (Python) | upstream HEAD pinned per Plan-phase | Single-pair matching for inlier count | +| TensorRT | matches C7 | LightGlue inference engine reuse | + +**Error Handling Strategy**: +- `RerankBackboneError`: LightGlue forward pass failed on one or more candidates. The candidate is dropped from the rerank set; if fewer than N=3 candidates survive, C2.5 returns whatever it has and C3 proceeds with reduced N. +- `TileFetchError`: C6 read failure for a candidate tile. Same drop-and-continue behaviour as above. +- Hard failure (zero candidates left after rerank): emit no `RerankResult`; C5 falls back to VIO-only with provenance label `visual_propagated`. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `LightGlueRuntime` | shared LightGlue inference handle (one engine, many call sites) | C2.5, C3 | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- The re-rank correctness depends on LightGlue inlier-count being a meaningful proxy for cross-domain match quality at single-pair resolution. If a backbone in C2 returns visually-similar-but-geographically-wrong candidates, C2.5's inlier count can still rank them above the true match — AC-NEW-7 cache-poisoning safety budget catches this downstream. + +**Potential race conditions**: +- Shared LightGlue runtime is the same object as C3 uses. Serial access from a single ingest thread; concurrent calls forbidden. + +**Performance bottlenecks**: +- 10 LightGlue passes per frame is non-trivial; budget allocation lives in `tests/performance-tests.md` NFT-PERF-01 partition. + +## 8. Dependency Graph + +**Must be implemented after**: C2 (input), shared `LightGlueRuntime` helper (which both C2.5 and C3 consume), C6 (tile pixels), C7 (inference runtime). 
C2.5 does **not** depend on C3 at build time — they are sibling consumers of the helper, and the data flow is C2.5 → C3 (not the other way). + +**Can be implemented in parallel with**: C1 (independent path). + +**Blocks**: C3 (no `RerankResult`, C3 has no input), F3 / F6. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | Zero candidates surviving rerank | `Re-rank produced 0 candidates; frame=12345; falling back to visual_propagated` | +| WARN | /`. diff --git a/_docs/02_document/components/04_c3_matcher/description.md b/_docs/02_document/components/04_c3_matcher/description.md new file mode 100644 index 0000000..7d4bda2 --- /dev/null +++ b/_docs/02_document/components/04_c3_matcher/description.md @@ -0,0 +1,127 @@ +# C3 — Cross-domain Matcher + +## 1. High-Level Overview + +**Purpose**: produce 2D-3D correspondences between the current `NavCameraFrame` and the top-N=3 satellite tiles from C2.5, with RANSAC-filtered inliers and reprojection residual statistics. C3 is the **cross-domain** step (nav-camera ↔ satellite-imagery domain gap) and is the dominant compute cost in F3. + +**Architectural Pattern**: Strategy — `CrossDomainMatcher` interface, with concrete implementations DISK+LightGlue (D-C3-1 = (a) primary), ALIKED+LightGlue (secondary), XFeat (alternate). Selection at startup by config (ADR-001); build-time gating by `BUILD_*` flags (ADR-002); composition-root wired (ADR-009). + +**Upstream dependencies**: +- C2.5 → `RerankResult` (top-N=3 candidates). +- C7 InferenceRuntime → backbone forward pass. +- Camera calibration artifact — for nav-frame preprocessing. +- C6 TileStore — for tile pixels (handle carried in `RerankCandidate`). + +**Downstream consumers**: +- C3.5 ConditionalRefiner (consumes `MatchResult`; passthrough or AdHoP refinement). + +## 2. 
Internal Interfaces + +### Interface: `CrossDomainMatcher` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `match` | `NavCameraFrame, RerankResult, CameraCalibration` | `MatchResult` | No | `MatcherBackboneError`, `InsufficientInliersError` | +| `health_snapshot` | `()` | `MatcherHealth` | No | — | + +**Input DTOs**: +``` +NavCameraFrame: see C1 +RerankResult: see C2.5 +CameraCalibration: see C5 +``` + +**Output DTOs**: +``` +MatchResult: + frame_id: uuid + per_candidate: list[CandidateMatchSet] (length up to N=3, drop on failure) + best_candidate_idx: int — argmax(inlier_count) among per_candidate + reprojection_residual_px: float — best candidate's median residual + matched_at: monotonic_ns + matcher_label: string — for FDR provenance + +CandidateMatchSet: + tile_id: composite (zoomLevel, lat, lon) + inlier_count: int + inlier_correspondences: ndarray[I, 4, dtype=float32] — (px_query, py_query, px_tile, py_tile) + ransac_outlier_count: int + per_candidate_residual_px: float + +MatcherHealth: + consecutive_low_inlier: int + mean_inliers_60s: float +``` + +## 3. External API Specification + +Not applicable. + +## 4. Data Access Patterns + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| Tile pixel access (3 tiles per frame) | 3 Hz × 3 = 9 Hz | Yes | C6 mmap | + +No additional caching beyond C6. + +## 5. Implementation Details + +**Algorithmic Complexity**: `O(N · F · F)` for the matching pass per backbone (N=3 candidates, F features per image), plus RANSAC `O(I · trials)` on inliers. Dominant cost is the backbone forward pass. + +**State Management**: stateless per-frame. Holds the shared LightGlue / DISK runtime handle. 
+ +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| DISK (Python) | upstream HEAD pinned per Plan-phase | Primary feature extractor (D-C3-1 = (a)) | +| LightGlue (Python) | upstream HEAD pinned per Plan-phase | Primary matcher; replaces SuperPoint+SuperGlue (Magic Leap noncommercial) | +| ALIKED (Python) | upstream HEAD pinned per Plan-phase | Secondary feature extractor | +| XFeat (Python) | upstream HEAD pinned per Plan-phase | Alternate (lightweight) feature+matcher | +| OpenCV | ≥ 4.12.0 | RANSAC + reprojection residual computation | +| TensorRT | matches C7 | Backbone engine compilation/runtime | + +**Error Handling Strategy**: +- `MatcherBackboneError`: backbone forward pass failed on a candidate. Candidate dropped from `per_candidate`; if all N=3 candidates fail, emit `InsufficientInliersError`. +- `InsufficientInliersError`: RANSAC inlier count below configurable threshold on every candidate. C5 falls back to VIO-only with provenance label `visual_propagated`. F6 satellite re-localization may trigger if the condition persists. +- Catastrophic backbone failure (engine unloadable): treated as `MatcherBackboneError` for every frame until F8 reboot recovery. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `LightGlueRuntime` | shared LightGlue inference handle | C2.5, C3 | +| `RansacFilter` | RANSAC + reprojection residual computation thin wrapper | C3, C3.5, C4 | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- The cross-domain gap (nav-camera vs satellite tile) is the hardest step in the pipeline. Backbone choice depends on the deployment camera's spectral and resolution characteristics; the current default (DISK+LightGlue) is locked per Mode B Fact #110 / D-C3-1 = (a) pending IT-12 verdict. +- D-C2-12 (DINOv2-feature-based matcher) is a carryforward research item that may displace DISK in a future cycle. 
+ +**Potential race conditions**: +- Shared LightGlue runtime with C2.5; serial access from one ingest thread. + +**Performance bottlenecks**: +- C3 dominates the F3 latency budget. The D-CROSS-LATENCY-1 hybrid does NOT change C3 (K=N=3 stays); it changes C4 covariance recovery. + +## 8. Dependency Graph + +**Must be implemented after**: C2.5 (input), C7 (inference runtime). + +**Can be implemented in parallel with**: C1, C6 — independent paths. + +**Blocks**: C3.5 / C4 / C5, F3 / F6. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `InsufficientInliersError`; all N=3 candidates failed | `C3 zero-inlier on all candidates; frame=12345; backbone=disk_lightglue` | +| WARN | reprojection residual above threshold (will trigger AdHoP at C3.5) | `C3 residual=4.2px > threshold=2.5px; frame=12345; will refine` | +| INFO | Strategy ready | `C3 ready: matcher=disk_lightglue` | +| DEBUG | per-candidate inlier + residual | `C3 frame=12345 candidates=[(412,1.1px),(287,1.4px),(198,2.0px)]` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN only). diff --git a/_docs/02_document/components/04_c3_matcher/tests.md b/_docs/02_document/components/04_c3_matcher/tests.md new file mode 100644 index 0000000..a8d5dfe --- /dev/null +++ b/_docs/02_document/components/04_c3_matcher/tests.md @@ -0,0 +1,144 @@ +# Test Specification — C3 Cross-Domain Matcher + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. 
+ +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-1.1 | Frame-center GPS within 50 m for ≥80% of normal-flight photos | FT-P-01, **C3-IT-01** (inlier-budget partition) | Covered | +| AC-2.1b | Satellite-anchor registration | FT-P-05, **C3-IT-02** | Covered | +| AC-2.2 (cross-domain portion) | MRE <2.5 px cross-domain | FT-P-06, **C3-IT-03** | Covered | +| AC-3.1 | Tolerate up to 350 m outliers, tilt ±20° | FT-N-01, **C3-IT-04** | Covered | +| AC-4.1 | E2E latency <400 ms p95 | NFT-PERF-01, **C3-PT-01** | Covered | + +--- + +## Component-Internal Tests + +### C3-IT-01: per-candidate inlier-count floor on Derkachi + +**Summary**: on the Derkachi normal segment, the best candidate's RANSAC inlier count is at least the configured floor (default ≥ 80 inliers per frame). + +**Traces to**: AC-1.1 (component-level partition feeding AC-1.1's accuracy budget) + +**Description**: for the Derkachi normal-segment fixture, run C3 with the production-default DISK+LightGlue backbone; record `MatchResult.per_candidate[best_candidate_idx].inlier_count`; assert p5 ≥ 80 (i.e., ≥95% of frames clear the floor). + +**Input data**: `flight_derkachi/normal_segment_60_stills/` + C10-built tile descriptors (read-only). + +**Expected result**: p5 inlier count ≥ 80 for DISK+LightGlue. + +**Max execution time**: 4 min on Tier-1 (CPU fallback) / 90 s on Tier-2. + +--- + +### C3-IT-02: best-candidate selection determinism + +**Summary**: `best_candidate_idx == argmax(inlier_count)` for every emitted `MatchResult`. + +**Traces to**: AC-2.1b + +**Description**: 100 frames through `match`; assert (a) `best_candidate_idx` equals the index of the largest `inlier_count` in `per_candidate`, (b) ties are broken deterministically (lowest tile_id wins; check against a known-tie synthetic fixture). 
+ +**Input data**: `synthetic_matcher/known_tie_10f/` (synthetic frames where two candidates have identical inlier counts). + +**Expected result**: deterministic tie-breaking; no `best_candidate_idx` mismatch in 100/100 frames. + +**Max execution time**: 60 s. + +--- + +### C3-IT-03: cross-domain MRE bound + +**Summary**: p95 of the per-frame reprojection residual stays under 2.5 px for the production-default matcher on the Derkachi normal segment. + +**Traces to**: AC-2.2 + +**Description**: same Derkachi fixture; record `MatchResult.reprojection_residual_px`; assert p95 < 2.5 px. + +**Input data**: as C3-IT-01. + +**Expected result**: p95 < 2.5 px for DISK+LightGlue. ALIKED+LightGlue (secondary) and XFeat (alternate) tested on a smoke subset only — comparative-study verdict belongs to IT-12. + +**Max execution time**: 4 min. + +--- + +### C3-IT-04: tilt + outlier robustness + +**Summary**: under ±20° tilt and synthetic 350 m outliers, the matcher still produces a usable inlier set (≥40 inliers). + +**Traces to**: AC-3.1 + +**Description**: synthetically tilt the Derkachi frames by {−20°, −10°, 0°, +10°, +20°}; inject 350 m position outliers into the candidate tile metadata for 5% of frames; assert C3 emits a `MatchResult` with `best_candidate.inlier_count ≥ 40` for ≥90% of frames in each tilt bucket. + +**Input data**: `flight_derkachi/tilted_±20°/` (deterministic synthetic tilt). + +**Expected result**: per-bucket inlier-count p10 ≥ 40 for DISK+LightGlue. + +**Max execution time**: 6 min. + +--- + +### C3-IT-05: `InsufficientInliersError` propagation + +**Summary**: when all N=3 candidates fall below the inlier floor, C3 raises `InsufficientInliersError` and emits no `MatchResult`. 
+ +**Traces to**: AC-3.5 (defensive — keeps the spoof-fallback path clean) + +**Description**: synthetic frames with deliberately mismatched candidate tiles (cross-region pulls); assert `match` raises `InsufficientInliersError` for every frame and the downstream consumer (C3.5 / C4) receives no `MatchResult`. + +**Input data**: `synthetic_matcher/cross_region_mismatch_20f/`. + +**Expected result**: 20/20 frames raise the error. + +**Max execution time**: 60 s. + +--- + +## Performance Tests + +### C3-PT-01: per-frame match latency on Tier-2 (dominant cost) + +**Traces to**: AC-4.1 (C3 owns the largest single partition of the budget) + +**Load scenario**: 3 Hz, N=3 candidates, 10 min replay; concurrent C2 backbone + C5 iSAM2 update on the same Jetson. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `match` p95 | ≤ 180 ms (DISK+LightGlue) | 280 ms | +| Per-candidate p95 | ≤ 60 ms | 95 ms | +| Throughput | ≥ 3 Hz sustained | < 2.5 Hz | + +**Resource limits**: +- GPU memory: ≤ 800 MB for backbone + matcher engines combined. + +--- + +## Security Tests + +C3 has no externally-reachable surface; defensive coverage at the cache-poisoning level (NFT-SEC-01) and via shared-runtime invariants (C2.5-IT-03). + +--- + +## Acceptance Tests + +Covered transitively via FT-P-01, FT-P-05, FT-P-06. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| `flight_derkachi/normal_segment_60_stills/` | shared | shared | +| `flight_derkachi/tilted_±20°/` | generated | ~150 MB | +| `synthetic_matcher/known_tie_10f/` | generated | ~5 MB | +| `synthetic_matcher/cross_region_mismatch_20f/` | generated | ~10 MB | +| C10-built tile descriptors | C10 artifact | shared | + +**Setup**: C10 must have built tile descriptors; matchers' TRT engines must be compiled (consumes ~5 min on Tier-2 first run; cached after). +**Teardown**: read-only. +**Data isolation**: per-test temp dirs. 
diff --git a/_docs/02_document/components/05_c3_5_adhop/description.md b/_docs/02_document/components/05_c3_5_adhop/description.md new file mode 100644 index 0000000..6609e76 --- /dev/null +++ b/_docs/02_document/components/05_c3_5_adhop/description.md @@ -0,0 +1,107 @@ +# C3.5 — AdHoP-conditional Refinement + +## 1. High-Level Overview + +**Purpose**: conditionally refine `MatchResult` via OrthoLoC AdHoP (method-agnostic perspective preconditioning) when the initial reprojection residual exceeds a configurable threshold; bypass otherwise. The conditional invocation preserves the AC-4.1 latency budget on the steady-state path while keeping the refinement option for hard frames. + +**Architectural Pattern**: Strategy with two concrete implementations: `AdHoPRefiner` (real refinement) and `PassthroughRefiner` (no-op for the non-conditional baseline / smoke tests). Selection at startup by config (ADR-001); both implementations linked into the deployment binary by default (refinement is conditionally invoked at runtime, not gated at build time). + +**Upstream dependencies**: +- C3 → `MatchResult`. +- C7 InferenceRuntime — AdHoP backbone forward pass when invoked. + +**Downstream consumers**: +- C4 PoseEstimator (consumes the possibly-refined `MatchResult`). + +## 2. 
Internal Interfaces + +### Interface: `ConditionalRefiner` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `refine_if_needed` | `NavCameraFrame, MatchResult, residual_threshold_px: float` | `MatchResult` (possibly enriched with refined inliers) | No | `RefinerBackboneError` | +| `was_invoked` | `()` | `bool` — last-call flag for FDR provenance | No | — | + +**Input DTOs**: +``` +NavCameraFrame: see C1 +MatchResult: see C3 +``` + +**Output DTOs**: +``` +MatchResult (refined): + Same shape as C3's MatchResult, with the following enrichments when refinement was invoked: + per_candidate[i].inlier_correspondences: refined coordinates from AdHoP perspective preconditioning + per_candidate[i].per_candidate_residual_px: post-refinement residual + refinement_label: "adhop" | "passthrough" + refinement_added_latency_ms: float — for FDR latency partition +``` + +## 3. External API Specification + +Not applicable. + +## 4. Data Access Patterns + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| Nav-frame pixel access (1 frame per invocation) | conditional, <3 Hz | When invoked | already in memory from C1/C2 path | + +No additional caching. + +## 5. Implementation Details + +**Algorithmic Complexity**: AdHoP perspective preconditioning is `O(F)` in feature count plus one extra backbone forward pass. Steady-state cost is zero (passthrough); worst-case adds ~30–90 ms per invocation on the Jetson. + +**State Management**: stateless per-frame. + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| OrthoLoC AdHoP (research code drop) | upstream HEAD pinned per Plan-phase | Conditional refinement | +| OpenCV | ≥ 4.12.0 | Reprojection residual computation, perspective transforms | +| TensorRT | matches C7 | AdHoP backbone engine when invoked | + +**Error Handling Strategy**: +- `RefinerBackboneError`: AdHoP backbone failure. 
Fall through to passthrough — emit C3's original `MatchResult` unchanged; downstream pose estimation may fail subsequent quality gates and trigger F6 satellite re-localization. +- The conditional gate is a configuration parameter (`adhop.residual_threshold_px`); no runtime decision logic outside that comparison. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `RansacFilter` | shared with C3 | C3, C3.5, C4 | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- The threshold is tunable; setting it too low triggers AdHoP every frame and breaks the AC-4.1 budget; too high lets bad matches through to C4. Calibration via NFT-PERF-01 invocation-rate measurement during planning's NFT validation cycle. +- D-CROSS-LATENCY-1 hybrid does not directly change C3.5's behaviour; under thermal throttle, the elevated frame-to-frame latency may require the threshold to be raised at the operator-tooling level pre-flight (C12 concern). + +**Potential race conditions**: +- Stateless; concurrent calls are safe in principle but the hot path is single-threaded by design. + +**Performance bottlenecks**: +- AdHoP invocation is the variable cost in the F3 budget. NFT-PERF-01 measures the invocation rate; an invocation rate above ~30% suggests the threshold needs revisiting. + +## 8. Dependency Graph + +**Must be implemented after**: C3 (input), C7 (inference runtime). + +**Can be implemented in parallel with**: C1, C6 — independent paths. + +**Blocks**: C4, F3 / F6. + +## 9. 
Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `RefinerBackboneError` | `AdHoP backbone failure; frame=12345; falling through to passthrough` | +| WARN | invocation rate over rolling 60 s exceeds budget | `AdHoP invocation rate 38% > target 25%; frame=12345` | +| INFO | Strategy ready | `Refiner ready: strategy=adhop, threshold=2.5px` | +| DEBUG | per-frame invoked/passthrough decision | `Refiner frame=12345 invoked=true added_latency_ms=42` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN only). DEBUG goes to FDR's per-frame estimate stream as a flag (`refinement_label`). diff --git a/_docs/02_document/components/05_c3_5_adhop/tests.md b/_docs/02_document/components/05_c3_5_adhop/tests.md new file mode 100644 index 0000000..1caf184 --- /dev/null +++ b/_docs/02_document/components/05_c3_5_adhop/tests.md @@ -0,0 +1,104 @@ +# Test Specification — C3.5 AdHoP-Conditional Refinement + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-2.2 (cross-domain MRE on hard frames) | MRE < 2.5 px after refinement | FT-P-06, **C3.5-IT-01** | Covered | +| AC-4.1 | E2E latency budget under conditional invocation | NFT-PERF-01, **C3.5-PT-01** | Covered | + +--- + +## Component-Internal Tests + +### C3.5-IT-01: residual reduction under refinement + +**Summary**: when `refine_if_needed` is invoked (residual > threshold), the post-refinement residual is strictly lower than the pre-refinement residual on ≥90% of invocations. 
+ +**Traces to**: AC-2.2 (hard-frame portion) + +**Description**: prepare a fixture of "hard frames" (synthetic perspective skew that pushes the C3 residual above the threshold); run C3 → C3.5; assert `MatchResult.per_candidate[best].per_candidate_residual_px` post-refinement < pre-refinement value in ≥90% of invocations. The remaining ≤10% are accepted (refinement can legitimately fail to improve a frame; passthrough is the documented fallback). + +**Input data**: `synthetic_matcher/hard_frames_perspective_skew_50f/`. + +**Expected result**: improvement rate ≥ 0.90. + +**Max execution time**: 5 min on Tier-1. + +--- + +### C3.5-IT-02: passthrough fall-through on `RefinerBackboneError` + +**Summary**: if the AdHoP backbone fails, C3.5 returns C3's original `MatchResult` unchanged with `refinement_label = "passthrough"`. + +**Traces to**: defensive (no AC trace; backstops the conditional-refinement design contract) + +**Description**: monkey-patch the AdHoP runtime to raise `RefinerBackboneError`; run 20 hard frames; assert (a) every frame produces a `MatchResult`, (b) `refinement_label == "passthrough"`, (c) error logged at ERROR level, (d) inlier coordinates equal C3's input coordinates bit-for-bit (no silent corruption). + +**Input data**: as C3.5-IT-01. + +**Expected result**: 20/20 passthrough; bit-identical correspondences. + +**Max execution time**: 60 s. + +--- + +### C3.5-IT-03: invocation rate stays under target on the steady-state path + +**Summary**: on the Derkachi normal segment, the AdHoP invocation rate is below the 25% target threshold; if it exceeds 30%, the test fails (signals a threshold mis-tune). + +**Traces to**: NFT-PERF-01 invocation-rate budget partition + +**Description**: replay Derkachi normal segment; count `was_invoked()` true returns over 60 s rolling windows; assert max invocation rate < 0.30 (warning at 0.25 per the component spec § 9 logging table). + +**Input data**: `flight_derkachi/normal_segment_60_stills/`. 
+ +**Expected result**: max invocation rate < 0.30. + +**Max execution time**: 90 s. + +--- + +## Performance Tests + +### C3.5-PT-01: refinement-added latency budget + +**Traces to**: AC-4.1 + +**Load scenario**: 3 Hz, mixed normal + hard frames (25% hard) replicating a realistic invocation rate. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `refine_if_needed` p95 (when invoked) | ≤ 90 ms (one extra backbone forward + perspective remap) | 150 ms | +| `refine_if_needed` p95 (passthrough) | ≤ 0.5 ms | 2 ms | +| Frame-aggregated added latency (mean over all frames) | ≤ 25 ms (≤25% × 90 ms invocation; ≥75% × 0.5 ms passthrough) | 40 ms | + +--- + +## Security Tests + +C3.5 has no externally-reachable surface. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-06. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| `synthetic_matcher/hard_frames_perspective_skew_50f/` | generated, deterministic | ~50 MB | +| `flight_derkachi/normal_segment_60_stills/` | shared | shared | +| AdHoP TRT engine | C7-built artifact | shared | + +**Setup**: AdHoP engine must be compiled (deterministic; cached after first build). +**Teardown**: read-only. +**Data isolation**: per-test temp dirs. diff --git a/_docs/02_document/components/06_c4_pose/description.md b/_docs/02_document/components/06_c4_pose/description.md new file mode 100644 index 0000000..eb13b0d --- /dev/null +++ b/_docs/02_document/components/06_c4_pose/description.md @@ -0,0 +1,115 @@ +# C4 — Pose Estimation + +## 1. High-Level Overview + +**Purpose**: convert `MatchResult` (2D-3D correspondences) into a `PoseEstimate` — WGS84 position + 6×6 covariance + provenance label + `last_satellite_anchor_age_ms` — using OpenCV `solvePnPRansac` (IPPE) wrapped in GTSAM `Marginals` for native 6×6 posterior covariance recovery (D-C4-2 = (b)). 
Under thermal throttle, auto-degrades to Jacobian-based covariance (D-C4-2 = (a)) per the D-CROSS-LATENCY-1 hybrid. + +**Architectural Pattern**: single concrete implementation `OpenCVGtsamPoseEstimator` behind the `PoseEstimator` interface. The pose estimator and the state estimator (C5) **share the GTSAM substrate**; the C4 factor is added directly to C5's iSAM2 graph rather than computed in isolation. + +**Upstream dependencies**: +- C3.5 → `MatchResult` (refined or passthrough). +- C5 StateEstimator — supplies the GTSAM iSAM2 handle so C4 can add its factor in-graph (architecture principle: shared substrate per ADR-003). +- Camera calibration artifact — for intrinsics + distortion + body-to-camera extrinsics. +- C7 InferenceRuntime — only indirectly via the LightGlue inliers fed in from C3 / C3.5; C4 itself is OpenCV+GTSAM, not GPU-bound. + +**Downstream consumers**: +- C5 StateEstimator (consumes `PoseEstimate`). + +## 2. Internal Interfaces + +### Interface: `PoseEstimator` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `estimate` | `MatchResult, CameraCalibration, ThermalState` | `PoseEstimate` | No | `PnpFailureError`, `CovarianceDegradedWarning` | +| `current_covariance_mode` | `()` | `CovarianceMode` enum {MARGINALS, JACOBIAN} | No | — | + +**Input DTOs**: +``` +MatchResult: see C3 / C3.5 +CameraCalibration: see C5 +ThermalState: see C7 (telemetry from jetson-stats) +``` + +**Output DTOs**: +``` +PoseEstimate: + frame_id: uuid + position_wgs84: LatLonAlt — degrees, degrees, metres MSL + orientation_world_T_body: Quat (w, x, y, z) + covariance_6x6: Matrix6 — position (3x3) + orientation (3x3) sub-matrices + covariance_mode: CovarianceMode {MARGINALS, JACOBIAN} + source_label: enum {satellite_anchored, visual_propagated, dead_reckoned} + last_satellite_anchor_age_ms: int — bin input for AC-1.3 + emitted_at: monotonic_ns +``` + +## 3. External API Specification + +Not applicable. + +## 4. 
Data Access Patterns + +Stateless w.r.t. persistent storage; reads camera calibration once at construction. + +## 5. Implementation Details + +**Algorithmic Complexity**: `solvePnPRansac` is `O(I · trials)` in inlier count and RANSAC trials; `Marginals.marginalCovariance(pose_key)` is `O(K^3)` in keyframe-window size for the steady-state path (D-C5-3 K=10–20). Jacobian-degraded mode is `O(I)`. + +**State Management**: +- Stateless w.r.t. flight history (C5 owns history). +- Holds the GTSAM `Marginals` factor handle and the OpenCV `solvePnPRansac` configuration. +- Holds a reference to the shared GTSAM iSAM2 graph owned by C5 — does **not** own it. + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| OpenCV | ≥ 4.12.0 (CVE-2025-53644 mitigation) | `solvePnPRansac` with `SOLVEPNP_IPPE` flag; D-C4-1 = (b) | +| GTSAM (Python + C++) | per Plan-phase pin | `Marginals.marginalCovariance(pose_key)` for native 6×6 covariance | +| Eigen | matches GTSAM | Lie-algebra math | + +**Error Handling Strategy**: +- `PnpFailureError`: RANSAC convergence failure or degenerate match geometry. Emit no `PoseEstimate`; C5 falls back to VIO-only with provenance label `visual_propagated`. +- `CovarianceDegradedWarning`: thermal-throttle telemetry crossed the configurable threshold; auto-switch to Jacobian-based covariance (D-CROSS-LATENCY-1). Emit `PoseEstimate` with `covariance_mode = JACOBIAN`. NOT a fatal condition. +- The thermal-throttle decision is per-frame; once telemetry returns below threshold, switch back to MARGINALS on the next frame. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `SE3Utils` | shared with C1, C5 | C1, C4, C5 | +| `WgsConverter` | local-tangent-plane ↔ WGS84 latitude/longitude/altitude | C4, C8 | +| `RansacFilter` | shared RANSAC + reprojection residual | C3, C3.5, C4 | + +## 7. 
Caveats & Edge Cases + +**Known limitations**: +- Posterior covariance accuracy depends on the GTSAM substrate being healthy. C5's iSAM2 instability propagates into C4's covariance honesty. +- Jacobian-degraded covariance is a known accuracy trade (~5–10% loss per ADR-006); accepted under thermal throttle, never accepted on the steady-state path. + +**Potential race conditions**: +- Concurrent calls to `estimate` would race on the shared GTSAM graph. Single-threaded hot path; composition root binds C4 + C5 to the same thread. + +**Performance bottlenecks**: +- `Marginals.marginalCovariance(pose_key)` is the dominant cost in steady state (~30–90 ms). The D-CROSS-LATENCY-1 hybrid trades this for ~5–15 ms Jacobian under thermal throttle. + +## 8. Dependency Graph + +**Must be implemented after**: C3.5 (input), C5 (shared GTSAM substrate; circular at the design level — both must be co-developed but C5's iSAM2 graph must be constructable without C4 calling into it). + +**Can be implemented in parallel with**: C1, C6 — independent paths. + +**Blocks**: C5 (graph factor add depends on C4), F3 / F6 / F7. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `PnpFailureError` | `C4 PnP failure on frame=12345; mode=marginals; falling back to visual_propagated` | +| WARN | `CovarianceDegradedWarning`; thermal throttle entered | `C4 covariance degraded to JACOBIAN; thermal_throttle=true; clock_mhz=750` | +| INFO | Strategy ready | `C4 ready: estimator=opencv_gtsam, default_covariance=MARGINALS` | +| DEBUG | per-frame inlier count + residual + chosen mode | `C4 frame=12345 inliers=412 residual=0.9px mode=MARGINALS` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN always; DEBUG only via FDR per-frame estimate row). 
diff --git a/_docs/02_document/components/06_c4_pose/tests.md b/_docs/02_document/components/06_c4_pose/tests.md new file mode 100644 index 0000000..18c7e49 --- /dev/null +++ b/_docs/02_document/components/06_c4_pose/tests.md @@ -0,0 +1,123 @@ +# Test Specification — C4 Pose Estimator + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-1.1 | Frame-center GPS within 50 m for ≥80% of normal-flight photos | FT-P-01, **C4-IT-01** | Covered | +| AC-1.2 | Frame-center GPS within 20 m for ≥50% | FT-P-01, **C4-IT-01** | Covered | +| AC-1.4 | Estimate reports 95% covariance + source label | FT-P-03, **C4-IT-02** | Covered | +| AC-4.1 | E2E latency <400 ms p95 (incl. Marginals) | NFT-PERF-01, **C4-PT-01** | Covered | +| AC-NEW-5 | Operating envelope; thermal-throttle-driven covariance degradation hybrid | NFT-LIM-04, **C4-IT-03** | Covered (workstation portion) | + +--- + +## Component-Internal Tests + +### C4-IT-01: PnP convergence + WGS84 accuracy on Derkachi + +**Summary**: for the Derkachi normal segment, p80 of frame-center positions land within 50 m of ground truth, p50 within 20 m. + +**Traces to**: AC-1.1, AC-1.2 + +**Description**: feed `MatchResult` from C3 (Derkachi normal-segment fixture) into `estimate`; convert `position_wgs84` to local frame; compute distance to recorded GPS ground truth; assert p80 ≤ 50 m and p50 ≤ 20 m. + +**Input data**: `flight_derkachi/normal_segment_60_stills/` (with recorded GPS ground truth) + corresponding C3 `MatchResult` outputs (replayed from a recorded fixture so the test is C4-isolated). + +**Expected result**: p80 ≤ 50 m, p50 ≤ 20 m. + +**Max execution time**: 60 s. 
+ +--- + +### C4-IT-02: 6×6 covariance is SPD and honest under match degradation + +**Summary**: every emitted `PoseEstimate` carries an SPD `covariance_6x6`; under inlier degradation (synthetically reduced inlier count), the covariance norm rises monotonically. + +**Traces to**: AC-1.4 + +**Description**: replay 100 frames; for each, assert (a) covariance is symmetric and positive-definite, (b) when a fixture-injected inlier-degradation event occurs at frame 50, the cov norm rises and stays elevated for ≥10 frames. Also assert `source_label` reflects the gate state — `satellite_anchored` only when C5's gate confirms; `visual_propagated` otherwise. + +**Input data**: `synthetic_matcher/inlier_degradation_at_f50/`. + +**Expected result**: 100/100 frames pass SPD invariant; cov norm rises ≥1.5× steady-state for the degradation interval. + +**Max execution time**: 60 s. + +--- + +### C4-IT-03: D-CROSS-LATENCY-1 hybrid auto-degrade switch + +**Summary**: when `ThermalState.throttle == true`, `current_covariance_mode()` returns `JACOBIAN`; when false, returns `MARGINALS`. The switch is per-frame. + +**Traces to**: AC-NEW-5 (workstation-baseline portion; hot-soak chamber deferred) + +**Description**: drive a 60 s replay alternating thermal flag every 5 s; assert `covariance_mode` in each emitted `PoseEstimate` matches the input thermal flag for that frame; assert no jitter or hysteresis (the spec calls for per-frame decision). + +**Input data**: synthetic frames + a `ThermalState` injection harness. + +**Expected result**: 100% match between thermal input and `covariance_mode` output. + +**Max execution time**: 90 s. + +--- + +### C4-IT-04: shared-graph integration with C5 + +**Summary**: factors added by C4 to C5's iSAM2 graph survive an `update`/`marginalCovariance` cycle without corrupting prior keyframes. 
+ +**Traces to**: AC-1.4 (defensive — backstops the C4↔C5 shared-substrate co-dependency per ADR-003) + +**Description**: in a single test process, instantiate C5 with a 10-keyframe synthetic prior; have C4 add a `GenericProjectionFactorCal3DS2` for keyframe 11; trigger iSAM2 update; assert (a) the previously-known keyframe poses change by less than a configurable tolerance (10 cm position, 0.5° rotation), (b) marginals on every keyframe remain SPD. + +**Input data**: synthetic 10-keyframe iSAM2 prior + a known correspondence set. + +**Expected result**: prior-keyframe perturbations within tolerance; SPD invariant holds. + +**Max execution time**: 60 s. + +--- + +## Performance Tests + +### C4-PT-01: Marginals vs Jacobian path latency on Tier-2 + +**Traces to**: AC-4.1 + +**Load scenario**: 3 Hz, 10 min replay; thermal flag toggled every 30 s to exercise both modes. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `estimate` p95 (MARGINALS, K=15) | ≤ 90 ms | 130 ms | +| `estimate` p95 (JACOBIAN) | ≤ 15 ms | 25 ms | +| Mode-switch latency | ≤ 1 frame (i.e., next-frame switch) | > 1 frame | + +--- + +## Security Tests + +C4 has no externally-reachable surface. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-01 / FT-P-03 / FT-P-09-AP / FT-P-09-iNav. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Replayed C3 `MatchResult` for Derkachi | recorded once via fixture-build script | ~30 MB | +| `synthetic_matcher/inlier_degradation_at_f50/` | generated | ~10 MB | +| Synthetic 10-keyframe iSAM2 prior | scripted | <1 MB | + +**Setup**: replay fixture must be re-recorded if C3's output schema changes (versioned in `tests/fixtures/c4_inputs/{schema_version}/`). +**Teardown**: read-only. +**Data isolation**: per-test temp dirs. 
diff --git a/_docs/02_document/components/07_c5_state/description.md b/_docs/02_document/components/07_c5_state/description.md new file mode 100644 index 0000000..9c1e000 --- /dev/null +++ b/_docs/02_document/components/07_c5_state/description.md @@ -0,0 +1,134 @@ +# C5 — State Estimator + +## 1. High-Level Overview + +**Purpose**: own the GTSAM `iSAM2` + `IncrementalFixedLagSmoother` (K=10–20 keyframes per D-C5-3) state. Fuse `VioOutput` (C1), `PoseEstimate` (C4), and FC IMU/attitude windows (C8 inbound) into the posterior pose with native 6×6 covariance via `Marginals` (D-C5-5 = (c)). Emit the smoothed corrected current frame to C8 for FC delivery; emit smoothed past-keyframes to C13 (FDR only — AC-4.5 internal smoothing, NOT FC retroactive correction). + +**Architectural Pattern**: Strategy with two concrete implementations: `GtsamIsam2StateEstimator` (production-default) and `EskfStateEstimator` (mandatory simple-baseline). Selection at startup (ADR-001), `BUILD_*` gating (ADR-002), composition-root wired (ADR-009). + +**Upstream dependencies**: +- C1 → `VioOutput` (relative pose + IMU bias). +- C4 → `PoseEstimate` (absolute satellite-anchored pose); C4 adds factors directly to C5's iSAM2 graph (shared substrate). +- C8 inbound side → FC `ImuWindow` + `AttitudeWindow` + `GpsHealth` (for warm-start AC-5.1, blackout AC-NEW-8, spoofing-promotion AC-NEW-2 / F7). + +**Downstream consumers**: +- C8 outbound side (per-FC encoder) → `EmittedExternalPosition` (5 Hz periodic to FC). +- C6 (mid-flight tile gen via orthorectifier; C5 supplies the `PoseEstimate` + quality_metadata for tile emission). +- C13 FDR (smoothed past-keyframe estimates, source-set switch events, spoofing-rejection events). + +## 2. 
Internal Interfaces + +### Interface: `StateEstimator` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `add_vio` | `VioOutput` | `None` | No | `EstimatorDegradedError`, `EstimatorFatalError` | +| `add_pose_anchor` | `PoseEstimate` | `None` | No | `EstimatorDegradedError`, `EstimatorFatalError` | +| `add_fc_imu` | `ImuWindow` | `None` | No | `EstimatorDegradedError` | +| `current_estimate` | `()` | `EstimatorOutput` (smoothed current keyframe) | No | — | +| `smoothed_history` | `n_keyframes: int` | `list[EstimatorOutput]` | No | — | +| `health_snapshot` | `()` | `EstimatorHealth` | No | — | + +**Input DTOs**: see C1, C4, C8. + +**Output DTOs**: +``` +EstimatorOutput: + frame_id: uuid + position_wgs84: LatLonAlt + orientation_world_T_body: Quat (w, x, y, z) + velocity_world: Vector3 (m/s) + covariance_6x6: Matrix6 + source_label: enum {satellite_anchored, visual_propagated, dead_reckoned} + last_satellite_anchor_age_ms: int + smoothed: bool — true for entries from `smoothed_history` + emitted_at: monotonic_ns + +EstimatorHealth: + isam2_state: enum {INIT, TRACKING, DEGRADED, LOST} + keyframe_count: int + cov_norm_growing_for_s: float — AC-NEW-8 monotonicity check + spoof_promotion_blocked: bool — AC-NEW-2 / AC-NEW-8 gate state +``` + +## 3. External API Specification + +Not applicable. + +## 4. Data Access Patterns + +C5 holds the GTSAM iSAM2 state in memory; persistent storage is only via FDR writes (C13 owns the file). No DB queries. + +### Storage Estimates + +| Table/Collection | Est. Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| In-memory keyframe window | up to 20 keyframes resident | ~2 KB / keyframe (factors + values) | ~40 KB | bounded by IncrementalFixedLagSmoother K=10–20 | + +C5 is bounded by design — no unbounded growth. + +## 5. 
Implementation Details + +**Algorithmic Complexity**: +- iSAM2 update on factor add: amortised `O(K)` in keyframe count for the typical case; `O(K^2)` worst-case on relinearisation. +- `Marginals.marginalCovariance(pose_key)`: `O(K^3)` in keyframe-window size; the dominant per-frame cost (~30–90 ms steady-state). +- `IncrementalFixedLagSmoother` keeps the active window bounded — older keyframes are marginalised out. + +**State Management**: +- iSAM2 graph + Values + Marginals lifecycle for the flight. +- Source-label state machine: tracks the AC-NEW-2 / AC-NEW-8 spoofing-promotion gate (≥10 s + visual consistency check before re-promoting a previously-spoofed FC GPS source). +- Last-anchor-age timer for AC-1.3 binning. + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| GTSAM (Python + C++) | per Plan-phase pin | iSAM2 + `CombinedImuFactor` + `BetweenFactorPose3` + `GenericProjectionFactorCal3DS2` + `Marginals` | +| `gtsam_unstable.IncrementalFixedLagSmoother` | per Plan-phase pin | Bounded keyframe window (D-C5-3 K=10–20) | +| Eigen | matches GTSAM | Lie-algebra math | + +**Error Handling Strategy**: +- `EstimatorDegradedError`: factor add yielded poor convergence; covariance inflated; emit `EstimatorOutput` with degraded label. +- `EstimatorFatalError`: iSAM2 numerical failure, KEYFRAME_LIMIT exceeded, etc.; emit no `EstimatorOutput` for this tick. AC-5.2 fallback (3 s no estimate → FC IMU-only) applies. +- Spoof-promotion gate: never re-introduce a previously-spoofed FC GPS source until BOTH (i) FC `gps_health == STABLE_NON_SPOOFED` for ≥ 10 s AND (ii) the next satellite-anchored frame agrees with the FC GPS within a configurable tolerance (AC-NEW-8). Document every reject in FDR + GCS STATUSTEXT. + +## 6. 
Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `ImuPreintegrator` | shared with C1 | C1, C5 | +| `SE3Utils` | shared with C1, C4 | C1, C4, C5 | +| `WgsConverter` | shared with C4, C8 | C4, C5, C8 | +| `SourceLabelStateMachine` | spoofing-promotion gate logic | C5 only — keep inside the component | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- AC-4.5 internal smoothing is **onboard only**; the FC log is forward-time only. The smoothed past-keyframe estimates go to FDR, not back to the FC. +- iSAM2 + `IncrementalFixedLagSmoother` requires careful key management; missing keys cause silent factor-add failures — the implementation MUST log every `add_*` call's success/failure status. + +**Potential race conditions**: +- Single writer thread for the iSAM2 graph by design. C1 + C4 + C8-inbound deliver to a timestamp-ordered merge queue ahead of C5's writer thread. + +**Performance bottlenecks**: +- `Marginals.marginalCovariance(pose_key)` is the per-frame hot spot. D-CROSS-LATENCY-1 hybrid degrades C4's covariance recovery (not C5's) under thermal throttle. + +## 8. Dependency Graph + +**Must be implemented after**: C1 (input), C4 (input + shared graph), C8 inbound (FC IMU prior). + +**Can be implemented in parallel with**: C6, C13 — independent paths. + +**Blocks**: C8 outbound (no per-frame estimate), F3 / F4 / F5 / F7 / F9 / F10. + +## 9. 
Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `EstimatorFatalError`; iSAM2 numerical failure; AC-5.2 path imminent | `C5 fatal iSAM2 failure; frame=12345; AC-5.2 fallback` | +| WARN | `EstimatorDegradedError`; spoofing-promotion blocked; cov norm growing >2× steady | `C5 degraded: cov_inflation=3.1, spoof_block=true` | +| INFO | Strategy ready; warm-start applied; spoof-promotion gate state changes | `C5 ready: estimator=gtsam_isam2, K=15` | +| DEBUG | per-frame factor adds + smoothed history depth | `C5 frame=12345 vio_added=true pose_added=true imu_added=true smoothed_n=15` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN always; smoothed past-keyframe entries always go to FDR per AC-4.5; spoofing-promotion-block events go to FDR + GCS STATUSTEXT). diff --git a/_docs/02_document/components/07_c5_state/tests.md b/_docs/02_document/components/07_c5_state/tests.md new file mode 100644 index 0000000..125f996 --- /dev/null +++ b/_docs/02_document/components/07_c5_state/tests.md @@ -0,0 +1,189 @@ +# Test Specification — C5 State Estimator + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. 
+ +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-1.3 | Cumulative drift between satellite-anchored fixes | FT-P-02, **C5-IT-01** | Covered | +| AC-1.4 | 95% covariance + source label | FT-P-03, **C5-IT-02** | Covered | +| AC-3.5 | Visual blackout + spoofed-GPS failsafe | FT-N-04, **C5-IT-03** | Covered | +| AC-4.5 (revised) | Internal smoothing of past keyframes (NOT FC retroactive correction) | FT-P-10, **C5-IT-04** | Covered | +| AC-5.2 | On >3 s without estimate, FC IMU-only fallback | NFT-RES-01, **C5-IT-05** | Covered | +| AC-NEW-2 | Spoofing-promotion latency <3 s p95 | NFT-PERF-04, **C5-IT-06** | Covered | +| AC-NEW-8 | Visual blackout + spoof degraded-mode escalation | FT-N-04, NFT-RES-04, **C5-IT-07** | Covered | + +--- + +## Component-Internal Tests + +### C5-IT-01: source_label state machine produces correct `last_satellite_anchor_age_ms` + +**Summary**: after a satellite-anchored frame, `last_satellite_anchor_age_ms` resets to the frame's age; under visual-propagated frames it monotonically increases. + +**Traces to**: AC-1.3 (binning input) + +**Description**: scripted sequence — 3 satellite-anchored frames, then 30 s of visual-propagated, then another satellite-anchored. Assert `last_satellite_anchor_age_ms` resets at the anchored events and rises monotonically between them with ms-level resolution. + +**Input data**: scripted `EstimatorOutput` sequence. + +**Expected result**: monotonic between resets; resets within 100 ms of the anchored frame's `emitted_at`. + +**Max execution time**: 60 s. + +--- + +### C5-IT-02: smoothed-current estimate honest covariance + +**Summary**: `current_estimate()` produces an SPD 6×6 covariance whose norm reflects the iSAM2 graph's actual posterior — not a fake-confidence value. 
+ +**Traces to**: AC-1.4 + +**Description**: build a synthetic graph where the keyframe-11 absolute factor is intentionally noisy (3× covariance); assert C5's emitted covariance norm is at least 2× the steady-state norm of a clean graph. Repeat with a clean factor; assert ≤1.2× steady-state. + +**Input data**: synthetic factor-graph fixtures. + +**Expected result**: noisy → ≥2× norm; clean → ≤1.2× norm. + +**Max execution time**: 30 s. + +--- + +### C5-IT-03: VIO-only fallback under cross-domain matcher failure + +**Summary**: when C4 stops emitting `PoseEstimate` (matcher failure), C5 continues with VIO-only and labels `source_label = visual_propagated`. + +**Traces to**: AC-3.5 + +**Description**: feed `add_vio` for 60 s while withholding `add_pose_anchor` calls; assert (a) `current_estimate` keeps emitting, (b) `source_label == visual_propagated`, (c) `cov_norm_growing_for_s` rises monotonically. + +**Input data**: scripted VIO-only fixture. + +**Expected result**: 60 s of visual_propagated estimates; cov norm monotonically rising. + +**Max execution time**: 90 s. + +--- + +### C5-IT-04: smoothed past-keyframe history is NOT forwarded to FC + +**Summary**: `smoothed_history(n)` reflects past-keyframe smoothing per AC-4.5 (revised), but the FC emission path uses `current_estimate` only — the smoothing must NOT alter what C8 emits as `GPS_INPUT` / `MSP2_SENSOR_GPS`. + +**Traces to**: AC-4.5 (revised) + +**Description**: trigger an iSAM2 relinearisation that materially shifts a 5-keyframe-old pose; assert (a) `smoothed_history(10)` shows the shift, (b) the next 10 calls to `current_estimate` are unaffected, (c) the FDR record stream contains the smoothed history (per AC-4.5) AND the unshifted FC emissions in the same flight log. + +**Input data**: synthetic graph with a deliberately-late loop closure. + +**Expected result**: history shows shift; current_estimate unaffected; FDR has both streams. + +**Max execution time**: 60 s. 
+ +--- + +### C5-IT-05: 3 s no-estimate threshold triggers AC-5.2 fallback + +**Summary**: when no `add_vio` and no `add_pose_anchor` arrive for >3 s, `current_estimate` returns a `dead_reckoned`-labeled output (or refuses to emit) so C8 can fall to FC IMU-only. + +**Traces to**: AC-5.2 + +**Description**: prime C5 with a normal warm-start; cease all input for 4 s; observe `current_estimate` over the gap; assert (a) at < 3 s gap, label is `visual_propagated`, (b) at ≥ 3 s gap, label transitions to `dead_reckoned` or no emission, (c) the transition timestamp is logged at ERROR level. + +**Input data**: scripted gap fixture. + +**Expected result**: transition at gap ≥ 3 s; ERROR logged. + +**Max execution time**: 30 s. + +--- + +### C5-IT-06: spoof-promotion gate enforces ≥10 s + visual consistency + +**Summary**: a previously-spoofed FC GPS source can only be re-promoted to trusted after BOTH (i) `gps_health == STABLE_NON_SPOOFED` for ≥10 s AND (ii) the next satellite-anchored frame agrees with FC GPS within tolerance. + +**Traces to**: AC-NEW-2, AC-NEW-8 + +**Description**: scripted scenario — initial trust → spoof event (gps_health = SPOOFED) → recovery to STABLE_NON_SPOOFED at t=0; satellite-anchored agreement frames at t=5 s, t=11 s. Assert promotion blocks until t=11 s + agreement; reject every promotion attempt before then; log every reject in FDR + STATUSTEXT. + +**Input data**: scripted `gps_health` + `EstimatorOutput` sequence. + +**Expected result**: promotion at t=11 s; rejects logged before then. + +**Max execution time**: 60 s. + +--- + +### C5-IT-07: visual blackout + spoof escalation + +**Summary**: simultaneous visual blackout (no C4 anchors) and spoofed FC GPS escalates to `dead_reckoned` source label and AC-NEW-8 STATUSTEXT. 
+ +**Traces to**: AC-NEW-8 + +**Description**: combine the visual-blackout fixture (no `add_pose_anchor` for 5 s) with a `gps_health == SPOOFED` event; assert the next emission is `dead_reckoned` and an AC-NEW-8 STATUSTEXT is published via the C8 path (mocked C8 in the test harness records the outgoing message). + +**Input data**: combined scripted fixture. + +**Expected result**: `dead_reckoned` label; STATUSTEXT recorded in mock C8 harness. + +**Max execution time**: 30 s. + +--- + +## Performance Tests + +### C5-PT-01: iSAM2 + Marginals throughput on Tier-2 + +**Traces to**: AC-4.1 + +**Load scenario**: 3 Hz `add_pose_anchor` + 200 Hz `add_fc_imu` + 3 Hz `add_vio`; 10 min replay. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `add_pose_anchor` + `current_estimate` p95 | ≤ 60 ms | 100 ms | +| `add_fc_imu` p95 | ≤ 1 ms (preintegration buffer-add only) | 5 ms | +| `add_vio` p95 | ≤ 5 ms | 15 ms | + +**Resource limits**: +- Memory: bounded by `IncrementalFixedLagSmoother K=10–20`; ≤ 100 MB resident. + +--- + +## Security Tests + +### C5-ST-01: spoof-rejection logging cannot be silenced + +**Summary**: every spoof-promotion-block event lands in FDR + GCS STATUSTEXT — the system has no config knob that disables this. + +**Traces to**: AC-NEW-2 / AC-NEW-8 (defensive) + +**Test procedure**: +1. Configure C5 with the production-default config. +2. Inject a spoof-promotion-block event. +3. Assert the FDR record stream contains an entry with `kind = "spoof_promotion_block"` AND a STATUSTEXT was issued. +4. Search the codebase for any config flag that could disable either path; assert no such flag exists. + +**Pass criteria**: FDR + STATUSTEXT both recorded; no disabling config flag found. +**Fail criteria**: either path missing or a disabling flag exists. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-02 / FT-P-03 / FT-N-04 / NFT-RES-01. 
+ +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Synthetic factor-graph fixtures | scripted | <5 MB | +| `flight_derkachi/normal_segment_60_stills/` (replayed VIO + pose feeds) | shared | shared | +| `gps_health` event fixtures | scripted | <1 MB | + +**Setup**: in-process; no external services. +**Teardown**: per-test temp dirs. +**Data isolation**: each test instantiates a fresh `StateEstimator`. diff --git a/_docs/02_document/components/08_c6_tile_cache/description.md b/_docs/02_document/components/08_c6_tile_cache/description.md new file mode 100644 index 0000000..3c2af75 --- /dev/null +++ b/_docs/02_document/components/08_c6_tile_cache/description.md @@ -0,0 +1,172 @@ +# C6 — Tile Cache + Spatial Index + +## 1. High-Level Overview + +**Purpose**: own the on-companion persistent imagery store (tiles + descriptor index + tile metadata) in a layout that is byte-identical to `satellite-provider`'s on-disk format, so the C11 `TileUploader` post-landing upload (F10) is a straight copy. Provide read access to C2 (descriptor index), C2.5 / C3 (tile pixels), and write access to the orthorectifier path during F4 (mid-flight tile generation), to the C11 `TileDownloader` during F1 (pre-flight tile fetch), and to C10 during F1 (Manifest + FAISS index write). + +**Architectural Pattern**: Repository — three concrete stores behind separate interfaces (`TileStore` for pixel + metadata I/O; `TileMetadataStore` for the Postgres spatial index; `DescriptorIndex` for FAISS HNSW). Single concrete implementation per interface today (`PostgresFilesystemStore`, `FaissDescriptorIndex`); future variants (e.g., RocksDB-backed metadata for resource-constrained tiers) can be added behind the same interfaces. + +**Upstream dependencies**: +- C11 `TileDownloader` (writes `tiles` rows + JPEGs during F1 pre-flight provisioning, source='googlemaps'). 
+- C10 CacheProvisioner (writes Manifest + FAISS index during F1 pre-flight provisioning, after C11 has populated tiles). +- C5 → orthorectifier → C6 (writes during F4 mid-flight tile generation, source='onboard_ingest'). +- C11 `TileUploader` (reads during F10). + +**Downstream consumers**: +- C2 VPR (reads descriptor index). +- C2.5 ReRanker (reads tile pixels). +- C3 CrossDomainMatcher (reads tile pixels). + +## 2. Internal Interfaces + +### Interface: `TileStore` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `read_tile_pixels` | `tile_id: TileId` | `TilePixelHandle` (mmap-backed) | No | `TileNotFoundError`, `TileFsError` | +| `write_tile` | `tile_blob: bytes, metadata: TileMetadata` | `None` | No | `TileFsError`, `TileMetadataError` | +| `tile_exists` | `tile_id: TileId` | `bool` | No | — | + +### Interface: `TileMetadataStore` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `query_by_bbox` | `bbox, zoom` | `list[TileMetadata]` | No | `TileMetadataError` | +| `insert_metadata` | `TileMetadata` | `None` | No | `TileMetadataError` | +| `mark_voting_status` | `tile_id, status: VotingStatus` | `None` | No | `TileMetadataError` | + +### Interface: `DescriptorIndex` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `search_topk` | `query: ndarray[D], k: int` | `list[(tile_id, distance)]` | No | `IndexUnavailableError` | +| `descriptor_dim` | `()` | `int` | No | — | +| `mmap_handle` | `()` | filesystem path / FAISS reader handle | No | — | + +**Input/Output DTOs**: +``` +TileId: composite (zoomLevel: int, lat: float, lon: float) + +TileMetadata: + tile_id: TileId + tile_size_meters: float + tile_size_pixels: int + capture_timestamp: ISO 8601 datetime + source: enum {googlemaps, onboard_ingest} + freshness_label: enum {fresh, stale_active_conflict, stale_rear, downgraded} + flight_id: uuid 
(optional, set for onboard_ingest) + companion_id: string (optional, set for onboard_ingest) + quality_metadata: TileQualityMetadata (optional, set for onboard_ingest) + voting_status: enum {pending, trusted, rejected} (default pending for onboard_ingest) + +TileQualityMetadata: see data_model.md (TileQualityMetadata entity) + +TilePixelHandle: opaque (filesystem path + mmap pointer; consumer must not copy) +``` + +## 3. External API Specification + +Not applicable — internal-only; C11 `TileUploader` reads via `TileStore` for upload to `satellite-provider` over an external HTTP API, and C11 `TileDownloader` writes via `TileStore` after fetching from `satellite-provider`. C11 owns those API calls, not C6. + +## 4. Data Access Patterns + +### Queries + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| `read_tile_pixels` (3 candidates per frame from C2.5/C3) | 9 Hz | Yes | Filesystem mmap (page-cache backed) | +| `search_topk` from C2 (top-K=10) | 3 Hz | Yes | FAISS HNSW (.index file) | +| `query_by_bbox` from C10/C11/C12 (cache build + post-landing) | offline / pre-flight | No | btree spatial index on (zoomLevel, lat, lon) | +| `insert_metadata` + `write_tile` from F4 orthorectifier | bursty during flight (≤ a few Hz on average) | No (background) | btree spatial index | + +### Caching Strategy + +| Data | Cache Type | TTL | Invalidation | +|------|-----------|-----|-------------| +| Tile pixels | OS page cache (filesystem mmap) | flight lifetime | None — files are append-only during flight; F1 wipes for next flight | +| FAISS descriptor index | mmap (`IO_FLAG_MMAP_IFC`) | flight lifetime | F1 rebuild + content-hash gate | +| Tile metadata rows | Postgres internal page cache + connection pool | flight lifetime | F4 inserts append; F1 may truncate-and-reload | + +### Storage Estimates + +| Table/Collection | Est. 
Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| `tiles` (Postgres rows) | ~ tens to hundreds of thousands per cached area | ~256 B per row | up to ~100 MB | bounded by 10 GB AC-8.3 cache budget overall | +| `tiles/{zoomLevel}/{x}/{y}.jpg` | matches `tiles` row count | 50–200 KB / tile | up to ~9 GB (after metadata/index overheads) | bounded by AC-8.3 | +| FAISS HNSW `.index` | one file per provisioning | 200 MB–1 GB depending on backbone descriptor dim | bounded by AC-8.3 carve-out per D-C2-6/9/10 | F1 atomic rebuild | +| Onboard mid-flight ingest tiles (F4) | a few hundred per flight | same as above | bounded by AC-8.3 carve-out | per flight | + +### Data Management + +**Seed data**: F1 pre-flight provisioning is the seeding step (operator runs C12 → C11 `TileDownloader` (tiles) → C10 (engines + descriptors + manifest) → C6). + +**Rollback**: F1 is idempotent (D-C10-1 manifest-hash check). Mid-flight writes during F4 are append-only; on F8 reboot, partially-written tiles are detected via the SHA-256 content-hash gate (D-C10-3) at takeoff, and the corrupted tile is dropped. + +## 5. Implementation Details + +**Algorithmic Complexity**: +- HNSW search: `O(log N)` in corpus size. +- bbox query: btree index `O(log N)` per probe; result-set scan `O(R)` in matched rows. + +**State Management**: +- `TileStore` is stateless (filesystem-backed). +- `TileMetadataStore` holds a Postgres connection pool. +- `DescriptorIndex` holds the FAISS reader + mmap'd index file handle; lifetime = flight. 
+ +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| PostgreSQL (server + libpq) | 16.x (mirror of `satellite-provider`'s pin) | Spatial metadata index | +| psycopg / asyncpg | per project pin | Python Postgres client | +| FAISS (Python + C++) | upstream HEAD pinned per Plan-phase | HNSW retrieval | +| atomicwrites | latest | Atomic file replacement for `.index` rebuild (D-C10-3) | +| hashlib (stdlib) | stdlib | SHA-256 content-hash sidecars | + +**Error Handling Strategy**: +- `TileNotFoundError`: tile pixel file missing despite metadata row present. Log + drop the candidate; signal cache inconsistency to C13 for FDR. +- `TileFsError`: I/O error on filesystem read/write. F4 writes retry once with backoff; reads do not retry (the candidate is dropped). +- `TileMetadataError`: Postgres failure. Pre-flight: F1 fails fast; takeoff blocked. Mid-flight: F4 writes drop the tile (logged); reads for C2's top-K fail to enrich and downstream uses descriptor distances only (degraded but not fatal). +- `IndexUnavailableError`: FAISS handle invalid (e.g., file replaced concurrently). Treated as fatal in flight; F8 reboot recovery re-mmaps. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `WgsConverter` | shared with C4, C5, C8 | C4, C5, C6 (bbox queries), C8 | +| `Sha256Sidecar` | atomic write + SHA-256 content-hash sidecar pattern (D-C10-3) | C6, C10 | +| `OrthorectifierUtils` | minimal orthorectification math (used by F4 write path) | C5, C6 — keep inside the F4 boundary; if needed by more components, promote | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- The on-disk layout MUST mirror `satellite-provider` exactly so F10 upload is byte-identical. Any deviation breaks AC-8.4. Verified by NFT-SEC-01 and IT-7. +- Postgres on a Jetson is heavyweight; runtime resource budget validated by NFT-LIM-01 (8 h replay). 
+ +**Potential race conditions**: +- F4 writes can race with C2/C2.5/C3 reads on the same tile filesystem. Solution: F4 writes use `atomicwrites` for atomic rename; readers see either the old version or the new version, never partial. +- FAISS `.index` mmap is read-only during a flight; F1 would never overwrite during a flight (F1 happens pre-flight only). + +**Performance bottlenecks**: +- Postgres metadata writes during F4 are bursty; if bursts approach 10 Hz, the Postgres connection becomes a bottleneck. Bound by configurable backpressure on the F4 path. +- FAISS HNSW first query pays the page-in cost (multi-second). F2 takeoff load forces a warm-up query so F3 first frame is cheap. + +## 8. Dependency Graph + +**Must be implemented after**: external `satellite-provider` on-disk layout reference (read its README + table schema; mirror exactly). + +**Can be implemented in parallel with**: C7 — independent paths. + +**Blocks**: C2, C2.5, C3, C10, C11 (both `TileDownloader` and `TileUploader`), F1, F2, F3, F4, F8, F10. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `TileNotFoundError` (cache inconsistency); `IndexUnavailableError` | `Tile pixel missing for tile_id=(z=18,lat=49.71,lon=37.45)` | +| WARN | F4 write retried; bbox query slower than threshold | `F4 tile write retry: tile_id=…; reason=fs_busy` | +| INFO | Provisioning complete; index loaded | `C6 ready: tiles=87654, faiss_dim=512, index_size_mb=412` | +| DEBUG | per-query timing | `C6 search_topk frame=12345 took=0.4ms; bbox_query took=2.1ms` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN only; the FDR also captures every successful F4 write as part of the mid-flight-tile-gen log). 
diff --git a/_docs/02_document/components/08_c6_tile_cache/tests.md b/_docs/02_document/components/08_c6_tile_cache/tests.md new file mode 100644 index 0000000..14c1e21 --- /dev/null +++ b/_docs/02_document/components/08_c6_tile_cache/tests.md @@ -0,0 +1,168 @@ +# Test Specification — C6 Tile Cache + Spatial Index + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-8.1 | Imagery via Suite Sat Service offline cache, ≥0.5 m/px | FT-P-15, FT-P-16, **C6-IT-01** | Covered | +| AC-8.2 | Tile freshness <6 mo (active-conflict) / <12 mo (rear) | FT-N-05, **C6-IT-02** | Covered | +| AC-8.4 | Mid-flight tile generation with quality metadata | FT-P-17, **C6-IT-03** | Covered | +| AC-NEW-3 | FDR ≤64 GB / flight, no silent drops (mid-flight tiles count toward C13 — C6 must not block) | NFT-LIM-02, **C6-IT-04** | Covered | +| AC-NEW-6 | System rejects/downgrades stale tiles | FT-N-05, FT-N-06, **C6-IT-05** | Covered | +| RESTRICT-SAT-2 | Cache budget 10 GB across operational area | NFT-LIM-03, **C6-IT-06** | Covered | + +--- + +## Component-Internal Tests + +### C6-IT-01: filesystem layout byte-identical to `satellite-provider` + +**Summary**: tiles written by C11 `TileDownloader` and tiles emitted at mid-flight by C5/C6 produce identical filesystem paths and JPEG byte content for the same `(zoomLevel, lat, lon)` coordinate. + +**Traces to**: AC-8.1 (interop with `satellite-provider` for the upload return path) + +**Description**: download a known tile via C11 TileDownloader (against the real `satellite-provider` mock fixture); generate the same tile via the mid-flight orthorectification path (C5 + C6); assert (a) filesystem path equality, (b) JPEG bytes equality, (c) `tiles` table row equality on `(zoomLevel, lat, lon, source, content_sha256)`. 
+ +**Input data**: a single Derkachi-area tile (`zoom=18, lat≈49.94, lon≈36.31`). + +**Expected result**: byte-identical layout. + +**Max execution time**: 10 s. + +--- + +### C6-IT-02: freshness gate at write-time + +**Summary**: a tile with `produced_at < now - active_conflict_max_age` is rejected at C6 insert if the target sector is `active_conflict`. + +**Traces to**: AC-8.2 + +**Description**: insert a synthetic tile with `produced_at = now - 7 months` into an `active_conflict` sector; assert C6 raises a freshness-rejection error and does NOT write to disk or DB. Repeat with `stable_rear` — assert the tile is written but flagged `freshness_downgraded`. + +**Input data**: synthetic tile fixtures with controlled timestamps. + +**Expected result**: rejection in active_conflict; downgrade-write in stable_rear. + +**Max execution time**: 5 s. + +--- + +### C6-IT-03: mid-flight tile insert with quality metadata + +**Summary**: every tile written via the F4 mid-flight gen path has full `quality_metadata` (covariance, inlier count, source-label provenance). + +**Traces to**: AC-8.4 + +**Description**: simulate F4 — feed C5 a 60 s sequence; the orthorectifier emits N tiles; assert each tile's row in `tiles` has non-null `quality_metadata.covariance_norm`, `quality_metadata.inlier_count`, `quality_metadata.source_label`. Empty quality metadata fails the test. + +**Input data**: scripted F4 fixture. + +**Expected result**: 100% of mid-flight tiles have full metadata. + +**Max execution time**: 90 s. + +--- + +### C6-IT-04: write throughput exceeds peak F4 demand + +**Summary**: C6 sustains the peak mid-flight write rate (1 tile / 2 s typical, ~5/s peak) without queueing-induced backpressure on C5. + +**Traces to**: AC-NEW-3 (no-silent-drop gate) + +**Description**: synthetic burst — 100 tiles enqueued at 5 Hz; assert (a) all 100 land in `tiles` table within 30 s, (b) no tile dropped, (c) C5's write-side queue depth never exceeds the configured ceiling. 
+ +**Input data**: synthetic burst fixture. + +**Expected result**: 100/100 written; no drops. + +**Max execution time**: 60 s. + +--- + +### C6-IT-05: stale-tile reject + downgrade across the F1 / F4 / F5 boundary + +**Summary**: in F5 (steady-state retrieval), a tile flagged `freshness_downgraded` is returned with the downgrade flag, and any consumer that depends on freshness (e.g., spoof-rejection check) sees the flag. + +**Traces to**: AC-NEW-6 + +**Description**: write a `freshness_downgraded` tile via C6-IT-02's stable_rear path; query via the C2/C3 fetch path; assert the returned tile carries the downgrade flag and that the consumer-side label propagates into the `EstimatorOutput.source_label`. + +**Input data**: as C6-IT-02. + +**Expected result**: downgrade flag survives the round-trip. + +**Max execution time**: 10 s. + +--- + +### C6-IT-06: 10 GB cache budget enforcement + +**Summary**: when the operational-area working set hits 10 GB, C6 evicts least-recently-used tiles to make room rather than silently dropping the incoming tile that would put the system over budget; every eviction is logged. + +**Traces to**: RESTRICT-SAT-2 + +**Description**: synthetic load that fills the cache to 10 GB - 50 MB; insert another 100 MB of tiles; assert (a) total disk usage stays ≤ 10 GB, (b) eviction count matches insert count above the threshold, (c) every eviction is logged at INFO with the evicted tile_id. + +**Input data**: synthetic tile generator. + +**Expected result**: 10 GB cap held; LRU eviction visible in logs. + +**Max execution time**: 5 min on Tier-1 (disk-bound). + +--- + +## Performance Tests + +### C6-PT-01: per-tile read latency on Tier-2 (mmap hit) + +**Traces to**: AC-4.1 (read-side; mmap is the design assumption for C2/C2.5/C3 hot paths) + +**Load scenario**: 3 Hz × 10 candidate tiles per frame × 10 min replay; OS page cache warm.
+ +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `read_tile_pixels` p95 (warm) | ≤ 0.5 ms | 5 ms | +| `read_tile_pixels` p95 (cold first read) | ≤ 50 ms | 200 ms | + +--- + +## Security Tests + +### C6-ST-01: content-sha256 mismatch rejects insert + +**Summary**: a tile insert with a content_sha256 that doesn't match the actual JPEG bytes is rejected at C6 (defends against the cache-poisoning path D-C10-3 covers at takeoff). + +**Traces to**: defensive (D-C10-3 + AC-NEW-7 cumulative) + +**Test procedure**: +1. Compute SHA-256 of a known JPEG. +2. Submit insert with the JPEG bytes + a deliberately-wrong SHA-256. +3. Assert insert is rejected and the DB row is not created. + +**Pass criteria**: insert rejection + no row. +**Fail criteria**: insert accepted or partial row created. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-15 / FT-P-16 / FT-P-17 / FT-N-05 / FT-N-06. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Single Derkachi tile (`zoom=18`) | curated | <1 MB | +| Synthetic tile fixtures with controlled timestamps | scripted | ~50 MB | +| Synthetic 10 GB cache fill set | scripted, deterministic | 10 GB on disk | +| `flight_derkachi/normal_segment_60_stills/` for F4 simulation | shared | shared | + +**Setup**: ephemeral Postgres + filesystem under `tests/tmp/c6/{test_id}/db/` and `…/tiles/`. +**Teardown**: drop tmp directory. +**Data isolation**: per-test ephemeral DB + filesystem. diff --git a/_docs/02_document/components/09_c7_inference/description.md b/_docs/02_document/components/09_c7_inference/description.md new file mode 100644 index 0000000..c43e5ac --- /dev/null +++ b/_docs/02_document/components/09_c7_inference/description.md @@ -0,0 +1,155 @@ +# C7 — On-Jetson Inference Runtime + +## 1.
High-Level Overview + +**Purpose**: provide a single inference-runtime abstraction that all GPU-using components (C1 selectively, C2, C2.5, C3, C3.5) consume. Owns engine compilation (Polygraphy / trtexec / IBuilderConfig hybrid), engine deserialization at takeoff load, GPU memory management, INT8 calibration cache trust, and the thermal-throttle telemetry feed that drives the D-CROSS-LATENCY-1 hybrid in C4. + +**Architectural Pattern**: Strategy — `InferenceRuntime` interface with three concrete implementations: `TensorrtRuntime` (production-default per D-C7-9 JetPack 6.2 + TensorRT 10.3 lock), `OnnxTrtEpRuntime` (fallback), `PytorchFp16Runtime` (mandatory simple-baseline). Selection at startup by config (ADR-001), build-time gating by `BUILD_*` flags (ADR-002), composition-root wired (ADR-009). + +**Upstream dependencies**: +- C10 CacheProvisioner → during F1 (after C11 `TileDownloader` has populated C6) triggers engine compilation when no cached engine matches the `(SM, JP, TRT, precision)` tuple. +- F2 takeoff load → triggers `deserialize_engine` for every model used by C1/C2/C2.5/C3/C3.5. +- jetson-stats / NVML → thermal-throttle telemetry source. + +**Downstream consumers**: +- C2 VPR (backbone forward pass). +- C2.5 ReRanker (LightGlue forward pass). +- C3 CrossDomainMatcher (DISK / LightGlue / ALIKED / XFeat forward passes). +- C3.5 AdHoP (conditional refinement backbone). +- C1 (only the strategies that have a CUDA path; KltRansac is CPU-only). +- C4 (consumes `ThermalState` for the D-CROSS-LATENCY-1 covariance-mode decision). + +## 2.
Internal Interfaces + +### Interface: `InferenceRuntime` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `compile_engine` | `model_path: Path, build_config: BuildConfig` | `EngineCacheEntry` | No (offline) | `EngineBuildError`, `CalibrationCacheError` | +| `deserialize_engine` | `EngineCacheEntry` | `EngineHandle` | No | `EngineDeserializeError` | +| `infer` | `EngineHandle, inputs: dict[str, Tensor]` | `dict[str, Tensor]` | No (sync GPU stream) | `InferenceError`, `OutOfMemoryError` | +| `release_engine` | `EngineHandle` | `None` | No | — | +| `thermal_state` | `()` | `ThermalState` | No | `TelemetryUnavailableError` | +| `current_runtime_label` | `()` | `string` | No | — | + +**Input/Output DTOs**: +``` +BuildConfig: + precision: enum {fp16, int8, mixed} + workspace_mb: int + calibration_dataset: Path (required for int8) + optimization_profiles: list[(input_name, min_shape, opt_shape, max_shape)] + +EngineCacheEntry: see data_model.md +EngineHandle: opaque GPU-resident handle + +ThermalState: + cpu_temp_c: float + gpu_temp_c: float + thermal_throttle_active: bool + measured_clock_mhz: int + measured_at: monotonic_ns +``` + +## 3. External API Specification + +Not applicable. + +## 4. 
Data Access Patterns + +### Queries + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| `infer` for VPR backbone | 3 Hz | Yes | n/a | +| `infer` for LightGlue (×10 in C2.5, ×3 in C3) | 3 Hz × 13 = 39 Hz | Yes | n/a | +| `infer` for AdHoP (conditional) | <1 Hz typical | Yes (when invoked) | n/a | +| `thermal_state` poll | 1 Hz from C4 | No (sampled, not per-frame) | n/a | + +### Caching Strategy + +| Data | Cache Type | TTL | Invalidation | +|------|-----------|-----|-------------| +| Compiled `.engine` files | filesystem keyed by `(SM, JP, TRT, precision)` (D-C10-7) | bounded by JetPack/TRT version stability | manifest content-hash gate at takeoff (D-C10-3) | +| INT8 calibration cache | filesystem alongside `.engine` (D-C10-6) | bounded by calibration dataset version | rebuild when calibration dataset hash changes | +| Resident engine handles | GPU memory | flight lifetime | F8 reboot recovery re-deserialises | + +### Storage Estimates + +| Table/Collection | Est. Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| `.engine` files | one per (model × precision × backbone) | 50 MB – 500 MB / engine | up to ~1.5 GB across all backbones for a deployment binary | bounded by AC-8.3 carve-out | +| INT8 calibration caches | one per engine | 1–10 MB | <50 MB | as above | + +### Data Management + +**Seed data**: pre-flight F1 provisioning compiles engines (or reuses cached). No mid-flight compilation. + +**Rollback**: D-C10-7 self-describing filename schema (`{model}_{variant}_sm{SM}_jp{JP}_trt{TRT}_{precision}.engine`; exact field order per the `EngineFilenameSchema` helper) makes stale engines visually obvious; F2 takeoff load refuses to deserialize an engine whose metadata doesn't match the host's current `(SM, JP, TRT)` tuple. + +## 5. Implementation Details + +**Algorithmic Complexity**: per-model forward pass cost is the design driver.
Engine builds are `O(complexity_of_optimizer_search)` — minutes for INT8 with calibration; sub-minute for FP16. + +**State Management**: +- Owns the CUDA stream(s) for the runtime; one stream per concurrent consumer (typically one stream because the F3 hot path is single-threaded). +- Owns the resident engine handles for the duration of a flight. +- Owns the polling loop for thermal-throttle telemetry (1 Hz background thread). + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| TensorRT (C++ + Python) | 10.3 (JetPack 6.2 pin) per D-C7-9 | Primary engine compile + deserialize + infer | +| Polygraphy | matches TensorRT | Engine build orchestration | +| trtexec | bundled with TensorRT | Alternate engine build path | +| ONNX Runtime + TRT EP | per project pin | Fallback runtime | +| PyTorch | per simple-baseline pin | FP16 baseline (mandatory) | +| jetson-stats / pynvml | latest | Thermal-throttle telemetry source | + +**Error Handling Strategy**: +- `EngineBuildError`: surface to C10/operator pre-flight; takeoff blocked. **Never silently fall back** between runtimes — if the configured runtime can't build, the operator must explicitly switch. +- `EngineDeserializeError` at takeoff: refuse takeoff with explicit `(SM, JP, TRT, precision)` mismatch detail. +- `InferenceError` mid-flight (rare; e.g., transient CUDA fault): emit no result for that frame; the consumer (C2/C3) reports its own degraded path. +- `OutOfMemoryError`: same as above; surface to C13 FDR and C12 operator-tooling for post-flight investigation. +- `TelemetryUnavailableError`: jetson-stats hung or unavailable. Default to "thermal_throttle_active = false" (D-CROSS-LATENCY-1 stays on the steady-state path); log WARN. +- `CalibrationCacheError`: per D-C10-6, calibration cache trust is critical; if the cache hash mismatches, refuse to use it and force a rebuild. + +## 6. 
Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `EngineFilenameSchema` | self-describing filename per D-C10-7 | C7, C10 | +| `Sha256Sidecar` | atomic write + content-hash sidecar pattern | C6, C7, C10 | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- TensorRT engines are NOT portable across `(SM, JP, TRT, precision)` tuples; Tier-1 (workstation Docker) cannot reuse Tier-2 (Jetson) engines. CI emits both tiers' engines as artifacts. +- INT8 calibration cache trust is the lurking foot-gun; D-C10-6 manifest-hash gate is the only protection. Any deviation breaks NFT-PERF-01 / NFT-LIM-01. + +**Potential race conditions**: +- The thermal-throttle polling thread MUST be reentrant-safe with the F3 hot path's `infer` calls. Use a lock-free atomic snapshot for `thermal_state`. + +**Performance bottlenecks**: +- Per-frame inference cost is the F3 hot path's largest contributor. NFT-PERF-01 partition is the source of truth. + +## 8. Dependency Graph + +**Must be implemented after**: nothing internal — C7 is foundational. + +**Can be implemented in parallel with**: C6, C13. + +**Blocks**: C1 (CUDA strategies), C2, C2.5, C3, C3.5, C4 (consumes `ThermalState`), C10, F1, F2, F3, F6, F8. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `EngineBuildError`, `EngineDeserializeError`, `OutOfMemoryError`, `CalibrationCacheError` | `C7 OOM during infer; backbone=ultravpr; frame=12345` | +| WARN | thermal-throttle entered/exited; telemetry unavailable | `C7 thermal throttle active; gpu_temp=83C; clock=750mhz` | +| INFO | Strategy ready; engine deserialised; backbone resident | `C7 ready: runtime=tensorrt, engines=[ultravpr@fp16, lightglue@fp16, disk@fp16]` | +| DEBUG | per-frame infer timing per backbone | `C7 infer backbone=ultravpr frame=12345 took=37ms` | + +**Log format**: structured JSON. 
+**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN always; thermal-state transitions always to FDR). diff --git a/_docs/02_document/components/09_c7_inference/tests.md b/_docs/02_document/components/09_c7_inference/tests.md new file mode 100644 index 0000000..da5a832 --- /dev/null +++ b/_docs/02_document/components/09_c7_inference/tests.md @@ -0,0 +1,170 @@ +# Test Specification — C7 On-Jetson Inference Runtime + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-4.1 | E2E latency <400 ms p95 | NFT-PERF-01 (Tier-2), **C7-PT-01** | Covered | +| AC-4.2 | Memory <8 GB on Jetson | NFT-LIM-01, **C7-PT-02** | Covered | +| AC-NEW-1 | Cold-start TTFF <30 s p95 | NFT-PERF-03, **C7-IT-01** | Covered | +| AC-NEW-5 | Operating envelope; thermal telemetry feed | NFT-LIM-04, **C7-IT-02** | Covered (workstation portion) | +| D-C10-3 | Manifest content-hash takeoff gate | **C7-IT-03** (gate is C10-owned, but the engine deserialise call is C7) | Covered | +| D-C10-7 | Engine filename schema (SM/JP/TRT/precision) | Helper-doc cited; **C7-IT-04** | Covered | + +--- + +## Component-Internal Tests + +### C7-IT-01: cold-start engine load + warm-up budget + +**Summary**: from a cold (zero-resident-engines) Jetson process, every required engine deserialises and warms up in under the AC-NEW-1 30 s p95 budget. + +**Traces to**: AC-NEW-1 + +**Description**: kill the companion process; restart; measure wall-clock from process start to "all engines warm" event in the FDR record stream. Repeat 10 times; assert p95 ≤ 30 s. + +**Input data**: pre-built engine cache for the Derkachi fixture profile. + +**Expected result**: p95 ≤ 30 s; no engine fails to warm. + +**Max execution time**: 6 min (10 × ~30 s + overhead). 
+ +--- + +### C7-IT-02: thermal telemetry feeds C4's hybrid + +**Summary**: `ThermalState` from `jetson-stats` is published at ≥1 Hz and is observable to C4; under simulated throttle, `thermal_throttle_active == true` is reported within 1 s of the throttle event. + +**Traces to**: AC-NEW-5 (workstation-baseline portion; chamber portion deferred per traceability matrix) + +**Description**: simulate a thermal-throttle event by spoofing the `jetson-stats` sysfs reading; assert (a) `ThermalState` updates carry `thermal_throttle_active == true` within 1 s, (b) C4's `current_covariance_mode` flips to JACOBIAN within 1 frame after that. + +**Input data**: scripted sysfs spoof. + +**Expected result**: 1 s telemetry latency; 1-frame C4 reaction. + +**Max execution time**: 30 s. + +--- + +### C7-IT-03: D-C10-3 takeoff gate refuses mismatched engine + +**Summary**: when the manifest's content-hash for an engine does not match the on-disk engine's hash, C7 refuses to deserialise and the F2 takeoff aborts. + +**Traces to**: D-C10-3 + +**Description**: corrupt one byte of a deployed engine after the manifest has been signed; trigger F2 takeoff load; assert (a) C7 raises `EngineHashMismatchError`, (b) the airborne process refuses to open the FC adapter, (c) the failure is logged at ERROR. + +**Input data**: a deployed engine + its corrupted twin. + +**Expected result**: takeoff aborts; ERROR logged. + +**Max execution time**: 30 s. + +--- + +### C7-IT-04: SM / JetPack / TRT / precision filename schema enforcement + +**Summary**: an engine file whose `<SM>/<JP>/<TRT>/<precision>` quadruple in the filename does not match the running Jetson's actual quadruple is refused at deserialise time. + +**Traces to**: D-C10-7 + +**Description**: copy a valid engine file but rename it with a mismatched SM (e.g., `sm86` instead of `sm87`); call `deserialize_engine`; assert `EngineSchemaMismatchError` and no GPU memory allocated. + +**Input data**: a valid engine + a renamed copy. + +**Expected result**: engine refused at filename-parse time. 
+ +**Max execution time**: 5 s. + +--- + +### C7-IT-05: ONNX-RT fallback when TRT engine unavailable + +**Summary**: if the primary TRT engine is missing or unloadable, C7 falls back to ONNX-RT + TRT-EP and continues without dropping the request. + +**Traces to**: defensive (engine-rule simple-baseline path) + +**Description**: rename the TRT engine for one model away (so deserialise fails); call `infer`; assert the call succeeds via ONNX-RT path with a degraded-latency warning logged. + +**Input data**: TRT engine + ONNX model side-by-side. + +**Expected result**: successful inference; degraded-latency warning. + +**Max execution time**: 30 s. + +--- + +## Performance Tests + +### C7-PT-01: per-call inference latency p95 by model + +**Traces to**: AC-4.1 + +**Load scenario**: scripted call rate matching production — UltraVPR @ 3 Hz, LightGlue @ 9 Hz (3 cands × 3 Hz), AdHoP conditional (~25%). + +**Expected results**: + +| Model | Mode | p95 latency target | Failure threshold | +|-------|------|--------------------|-------------------| +| UltraVPR | TRT FP16 | ≤ 60 ms | 100 ms | +| LightGlue | TRT FP16 | ≤ 30 ms | 60 ms | +| AdHoP | TRT FP16 | ≤ 90 ms | 150 ms | +| DISK | TRT FP16 | ≤ 50 ms | 90 ms | + +--- + +### C7-PT-02: aggregate GPU memory budget + +**Traces to**: AC-4.2 + +**Load scenario**: all production-default engines resident concurrently. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| GPU resident memory (all engines) | ≤ 4 GB | 5 GB | +| System RAM (process resident) | ≤ 1.5 GB | 2 GB | + +(remaining 8 GB shared LPDDR5 budget partition belongs to OS + ROS-equivalents + scratch; tracked at the system level by NFT-LIM-01.) + +--- + +## Security Tests + +### C7-ST-01: engine deserialise refuses files with no SHA-256 sidecar + +**Summary**: per Helper `Sha256Sidecar`, every engine has a sidecar `.sha256` file; deserialising an engine without one is refused. 
+ +**Traces to**: D-C10-3 (defensive) + +**Test procedure**: +1. Delete the sidecar for one valid engine. +2. Call `deserialize_engine` on it. +3. Assert refusal with `EngineSidecarMissingError`. + +**Pass criteria**: refusal + no GPU memory allocated. +**Fail criteria**: load succeeds. + +--- + +## Acceptance Tests + +C7 has no operator-facing behaviour; covered transitively via NFT-PERF-01 / NFT-PERF-03. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Pre-built engine cache for Derkachi profile | C10 build artifact | ~600 MB | +| Spoofed `jetson-stats` sysfs harness | scripted | <1 MB | +| Corrupted-engine fixture | scripted | varies | + +**Setup**: C10 must have built engines for SM 87 / JP 6.2 / TRT 10.3 / FP16 once before C7 tests can run on Tier-2. +**Teardown**: read-only. +**Data isolation**: per-test temp dirs. diff --git a/_docs/02_document/components/10_c8_fc_adapter/description.md b/_docs/02_document/components/10_c8_fc_adapter/description.md new file mode 100644 index 0000000..5935498 --- /dev/null +++ b/_docs/02_document/components/10_c8_fc_adapter/description.md @@ -0,0 +1,153 @@ +# C8 — Flight-Controller Adapter + +## 1. High-Level Overview + +**Purpose**: own the per-FC inbound + outbound communication. Inbound: subscribe to FC IMU/attitude/GPS-health/MAV_STATE telemetry; publish `ImuWindow` / `AttitudeWindow` / `GpsHealth` / `FlightStateSignal` for downstream consumers. Outbound: encode `EstimatorOutput` into the per-FC external-position contract (`GPS_INPUT` for ArduPilot Plane, `MSP2_SENSOR_GPS` for iNav) at 5 Hz periodic with honest 6×6 → 2×2 covariance projection. Owns MAVLink 2.0 message signing on the AP wired channel (D-C8-9 = (d)) and the per-flight key rotation. Also feeds the GCS link with downsampled telemetry (1–2 Hz per AC-6.1). + +**Architectural Pattern**: Strategy — `FcAdapter` interface with two concrete implementations: `PymavlinkArdupilotAdapter`, `Msp2InavAdapter`. 
Plus a `GcsAdapter` (single concrete `QgcTelemetryAdapter` today). All selected at startup by config (ADR-001), build-time gating per `BUILD_*` flags (ADR-002, both adapters typically linked into the deployment binary so a single image can target both FCs by configuration), composition-root wired (ADR-009). + +**Upstream dependencies**: +- C5 StateEstimator → `EstimatorOutput` (5 Hz periodic emit driver). +- Hardware: UART/USB to FC; UART (or USB) to GCS (often shared or via FC mavlink-routing). + +**Downstream consumers**: +- C5 StateEstimator (consumes `ImuWindow`, `AttitudeWindow`, `GpsHealth`, `FlightStateSignal`). +- C1 VIO (consumes `ImuWindow`). +- C13 FDR (consumes raw inbound + emitted outbound MAVLink/MSP2 streams; signing key rotation events; spoof-promotion events). +- C11 `TileUploader` (consumes `FlightStateSignal == ON_GROUND` confirmation; runs on a different process / image, so the signal flows out-of-band via the FDR or a small bus the operator tool subscribes to post-flight). The C11 `TileDownloader` does NOT depend on `FlightStateSignal` — it runs pre-flight when the companion is plugged into the operator workstation. + +## 2. 
Internal Interfaces + +### Interface: `FcAdapter` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `open` | `port_config: PortConfig, signing_key: bytes (AP only)` | `None` | No | `FcOpenError`, `SigningHandshakeError` | +| `subscribe_telemetry` | `callback: Callable[[FcTelemetryFrame], None]` | `Subscription` | No | — | +| `emit_external_position` | `EstimatorOutput` | `None` | No | `FcEmitError` | +| `emit_status_text` | `string, severity: enum {INFO, WARN, ERROR}` | `None` | No | `FcEmitError` | +| `request_source_set_switch` (AP only) | `()` | `None` | No | `SourceSetSwitchError` | +| `current_flight_state` | `()` | `FlightStateSignal` | No | — | + +### Interface: `GcsAdapter` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `emit_summary` | `EstimatorOutput` (downsampled to 1–2 Hz) | `None` | No | `GcsEmitError` | +| `subscribe_operator_commands` | `callback` | `Subscription` | No | — | + +**Input/Output DTOs**: +``` +PortConfig: + device: string (e.g. /dev/ttyTHS1) + baud: int + fc_kind: enum {ardupilot_plane, inav} + +FcTelemetryFrame: + kind: enum {imu_sample, attitude, gps_health, mav_state} + payload: union per kind + received_at: monotonic_ns + signed: bool — true only for AP signed frames + +EmittedExternalPosition: + per_fc_payload: enum (GPS_INPUT for AP / MSP2_SENSOR_GPS for iNav) + horiz_accuracy_m: float (AP) — projection of covariance_6x6 → 2x2 → equivalent_radius + hPosAccuracy_mm: int (iNav) — same source, mm units (D-C8-8 = (b)) + source_label: STATUSTEXT / NAMED_VALUE_FLOAT (out-of-band) +``` + +## 3. External API Specification + +C8 exposes no HTTP/gRPC API. Its **external** surfaces are: + +- **MAVLink 2.0 messages** to/from ArduPilot Plane, signed (D-C8-9 = (d)). Specifically: + - In: `RAW_IMU` / `SCALED_IMU2`, `ATTITUDE`, `GPS_RAW_INT`, `GPS2_RAW`, `HEARTBEAT`, `STATUSTEXT`. 
+ - Out: `GPS_INPUT` (5 Hz), `STATUSTEXT`, `NAMED_VALUE_FLOAT` (provenance label), `COMMAND_LONG MAV_CMD_SET_EKF_SOURCE_SET` (D-C8-2 source-set switch on F7). +- **MSP2 messages** to/from iNav over UART (unsigned, accepted residual risk): + - In: `MSP2_INAV_ANALOG`, attitude+IMU stream. + - Out: `MSP2_SENSOR_GPS` (5 Hz). MAVLink outbound for telemetry only (iNav speaks both). +- **MAVLink 2.0 messages** to/from QGroundControl on the GCS link (1–2 Hz downsampled summary out, operator commands in). + +Refer to `architecture.md` § 5 External Integrations for the per-message rate/auth/failure-mode table; this spec inherits those values. + +## 4. Data Access Patterns + +C8 holds no persistent storage. All communication is in-memory streaming. + +### Caching Strategy + +| Data | Cache Type | TTL | Invalidation | +|------|-----------|-----|-------------| +| Per-flight signing key (AP) | in-memory, generated at takeoff | flight lifetime | new key on next takeoff | +| Last-emitted external position (for downsampling 5 Hz → 1–2 Hz GCS) | in-memory ring | flight lifetime | n/a | +| FC telemetry rolling window (for `ImuWindow`, `AttitudeWindow` consumers) | in-memory bounded ring | bounded by C1/C5 consumption | drop-oldest | + +## 5. Implementation Details + +**Algorithmic Complexity**: per-message encode/decode is `O(message_size)`; the per-frame work is dominated by the covariance projection (3×3 sub-matrix → equivalent_radius), which is constant-time. + +**State Management**: +- AP path: holds the MAVLink 2.0 signing state (key, key generation timestamp, signature counter). +- iNav path: holds the MSP2 channel state (sequence numbers). +- Both paths: hold the bounded telemetry rings and a backpressure budget for the 5 Hz outbound emit. 
+ +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| pymavlink | bundled unmodified per D-C8-3 | AP MAVLink 2.0 + signing | +| YAMSPy | per project pin | iNav MSP2 | +| INAV-Toolkit | per project pin | iNav MSP2 message definitions for `MSP2_SENSOR_GPS` | +| pyserial | per project pin | UART transport | + +**Error Handling Strategy**: +- `FcOpenError` at takeoff: refuse takeoff with explicit error. +- `SigningHandshakeError` at takeoff (AP): refuse takeoff. Mid-flight signing failure: FC ignores unsigned messages, AC-5.2 takes over (3 s no estimate → FC IMU-only). Companion logs the rotation/expiry event to FDR. +- `FcEmitError` mid-flight: log + continue; the FC's own staleness detection will eventually trigger AC-5.2. +- `SourceSetSwitchError` (AP only, F7): try once; on failure log + STATUSTEXT to GCS; the system continues to emit `GPS_INPUT` and the operator can manually switch via RC aux per D-C8-2-FALLBACK. +- iNav unsigned: NOT an error — accepted residual risk per Mode B Source #129. Documented in `tests/security-tests.md` NFT-SEC-03. + +**Covariance projection** (D-C8-8 = (b)): +- Project 6×6 GTSAM `Marginals` covariance → 3×3 position sub-matrix → 2×2 horizontal sub-matrix → `equivalent_radius` per the per-FC contract: + - AP `horiz_accuracy` (m): `sqrt(0.5 * (sigma_xx + sigma_yy + sqrt((sigma_xx - sigma_yy)^2 + 4*sigma_xy^2)))` (square root of the largest eigenvalue of the 2×2 horizontal covariance, i.e. the 1-sigma extent along its major axis, where `sigma_xx`, `sigma_yy`, `sigma_xy` are the variance/covariance entries in m²; documented as the "honest covariance projection" in IT-10). + - iNav `hPosAccuracy` (mm): same value, converted to mm. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `WgsConverter` | shared with C4, C5, C6 | C4, C5, C6, C8 | +| `CovarianceProjector` | 6×6 → 2×2 → equivalent_radius — keep inside C8 (per-FC unit conversion is variant-specific per coderule SRP) | C8 only | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- iNav has no signing equivalent; accepted residual risk per Mode B Source #129. 
Plan-phase carryforward proposes an iNav firmware feature request — out of scope for this cycle. +- D-C8-2 source-set switch is firmware-supported but not deployed-precedent; ADR-008 makes IT-3 (ArduPilot SITL validation) the lock gate. + +**Potential race conditions**: +- The 5 Hz outbound emit thread MUST not block on the 100–200 Hz inbound telemetry decode thread. Bounded rings + drop-oldest semantics on the inbound side; outbound-emit timer is independent. +- The AP signing key rotation MUST happen between flights, not mid-flight. The key generation is at takeoff load (F2). + +**Performance bottlenecks**: +- pymavlink's signing handshake is sub-second on a wired UART/USB link. F2 budget is generous. + +## 8. Dependency Graph + +**Must be implemented after**: hardware UART config (operator workstation pre-flight stage). + +**Can be implemented in parallel with**: C6, C13. + +**Blocks**: C1 (FC IMU prior), C5 (factor inputs), F2 / F3 / F5 / F7 / F8 / F9 / F10. + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `FcOpenError`, `SigningHandshakeError`, persistent `FcEmitError` | `C8 AP signing handshake failed; refusing takeoff` | +| WARN | one-off `FcEmitError`; spoofing-promotion blocked event surfaced via STATUSTEXT | `C8 GPS_INPUT emit dropped; reason=uart_busy; will retry next tick` | +| INFO | adapter ready; key rotation event; F7 source-set switch executed | `C8 AP signing key rotated; flight_id=…; key_age_s=0` | +| DEBUG | per-emit timing | `C8 GPS_INPUT emit frame=12345 took=2.1ms; horiz_accuracy_m=4.2` | + +**Log format**: structured JSON. +**Log storage**: stdout / journald / FDR via C13 (ERROR + WARN always; AP signing key rotation events ALWAYS to FDR; F7 source-set switch events ALWAYS to FDR + GCS STATUSTEXT; STATUSTEXT broadcasts ALWAYS mirrored to FDR). 
diff --git a/_docs/02_document/components/10_c8_fc_adapter/tests.md b/_docs/02_document/components/10_c8_fc_adapter/tests.md new file mode 100644 index 0000000..e20ecec --- /dev/null +++ b/_docs/02_document/components/10_c8_fc_adapter/tests.md @@ -0,0 +1,224 @@ +# Test Specification — C8 FC + GCS Adapter + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-4.3 | FC output: GPS_INPUT (AP) + MSP2_SENSOR_GPS (iNav) with honest covariance | FT-P-03, FT-P-09-AP, FT-P-09-iNav, **C8-IT-01** | Covered | +| AC-4.4 | Estimates streamed frame-by-frame | NFT-PERF-02, **C8-IT-02** | Covered | +| AC-5.1 | Init from FC EKF's last valid GPS + IMU-extrapolated | FT-P-11, **C8-IT-03** | Covered | +| AC-5.2 | On >3 s without estimate, FC IMU-only fallback; SUT logs | NFT-RES-01 | Covered (C8 emission path) | +| AC-6.1 | GCS stream at 1–2 Hz | FT-P-12, **C8-IT-04** | Covered | +| AC-6.2 | GCS may send commands via standard MAVLink | FT-P-13, **C8-IT-05** | Covered | +| AC-6.3 | WGS84 output | FT-P-14, **C8-IT-06** | Covered | +| AC-NEW-2 | Spoofing-promotion latency <3 s p95 (source-set switch) | NFT-PERF-04, **C8-IT-07** | Covered | +| RESTRICT-COMM-2 | iNav has no MAVLink signing; documented asymmetry | NFT-SEC-03, **C8-IT-08** | Covered | +| D-C8-9 = (d) | MAVLink 2.0 per-flight signing on AP | IT-3, **C8-ST-01** | Open (gated by IT-3 SITL pass) | + +--- + +## Component-Internal Tests + +### C8-IT-01: per-FC encoder produces honest 6×6 → 2×2 covariance projection + +**Summary**: `EstimatorOutput.covariance_6x6`'s position 3×3 block projects to the FC's 2×2 horizontal covariance with the same Frobenius norm scale (no fake-confidence regression). 
+ +**Traces to**: AC-4.3 + +**Description**: 100 emitted `EstimatorOutput` records with varying covariances; capture the encoded `GPS_INPUT` bytes; decode via pymavlink; assert the FC-side 2×2 covariance Frobenius norm equals the source 3×3 horizontal-block norm to within 1% (small numerical tolerance for unit conversion). Same assertion against `MSP2_SENSOR_GPS` for the iNav adapter. + +**Input data**: scripted `EstimatorOutput` sequence with controlled covariances. + +**Expected result**: norm equality within 1% for both adapters. + +**Max execution time**: 30 s. + +--- + +### C8-IT-02: 5 Hz periodic emission stays within ±5% of nominal interval + +**Summary**: `emit_external_position` runs at exactly 5 Hz with jitter ≤ ±5%. + +**Traces to**: AC-4.4 + +**Description**: drive the emit path for 60 s; record emission timestamps; compute inter-emission interval distribution; assert mean = 200 ms ± 1 ms; std-dev ≤ 10 ms; max gap ≤ 220 ms. + +**Input data**: scripted `EstimatorOutput` source at 5 Hz. + +**Expected result**: jitter within bound. + +**Max execution time**: 90 s. + +--- + +### C8-IT-03: warm-start GPS from FC EKF feeds C5 + +**Summary**: at takeoff, the FC EKF's last-valid-GPS is read by C8 and surfaced via `current_flight_state` for C5's warm-start. + +**Traces to**: AC-5.1 + +**Description**: stage a SITL with a known last-valid-GPS; bring up C8; call `current_flight_state`; assert `FlightStateSignal` includes the EKF's pose hint with timestamp ≤ 1 s old. + +**Input data**: ArduPilot SITL with scripted EKF state. + +**Expected result**: warm-start hint exposed within 1 s of C8 ready. + +**Max execution time**: 60 s. + +--- + +### C8-IT-04: 1–2 Hz GCS downsampling + +**Summary**: GCS stream is downsampled from C5's 5 Hz to 1–2 Hz per AC-6.1. + +**Traces to**: AC-6.1 + +**Description**: drive 5 Hz `EstimatorOutput`; capture GCS-side stream; assert rate is between 1 and 2 Hz with consistent downsample factor. + +**Input data**: scripted source. 
+ +**Expected result**: 1–2 Hz GCS stream. + +**Max execution time**: 60 s. + +--- + +### C8-IT-05: GCS commands are accepted and routed + +**Summary**: standard MAVLink commands sent from QGC reach C8 and the system handles them per AC-6.2 (e.g., parameter read, mode set). + +**Traces to**: AC-6.2 + +**Description**: connect QGC SITL to the C8 GCS adapter; send a parameter-read; assert the response is well-formed and the parameter values match the running config. + +**Input data**: QGC SITL + C8 adapter. + +**Expected result**: parameter read returns values. + +**Max execution time**: 60 s. + +--- + +### C8-IT-06: WGS84 conversion round-trip identity + +**Summary**: `WgsConverter` (helper) round-trips a local-tangent-plane → WGS84 → local-tangent-plane within 1 cm position residual. + +**Traces to**: AC-6.3 (defensive — backstops the helper that C4 + C8 both rely on) + +**Description**: 1000 random local-tangent-plane points; round-trip through the helper; assert max residual < 1 cm position; max angular residual < 0.001°. + +**Input data**: scripted random points within ±10 km of an origin. + +**Expected result**: round-trip residual within bound. + +**Max execution time**: 5 s. + +--- + +### C8-IT-07: source-set switch latency under spoof event + +**Summary**: when the spoof gate clears, `request_source_set_switch` fires the AP `MAV_CMD_SET_EKF_SOURCE_SET` within the AC-NEW-2 3 s budget. + +**Traces to**: AC-NEW-2 + +**Description**: scripted scenario from C5-IT-06 (spoof recovery); assert C8 issues the `MAV_CMD_SET_EKF_SOURCE_SET` within 3 s of the gate-clear event; assert SITL acknowledges the command. + +**Input data**: ArduPilot SITL + scripted gate-clear. + +**Expected result**: command issued + ACK within 3 s. + +**Max execution time**: 60 s. + +--- + +### C8-IT-08: iNav signing-asymmetry assertion + +**Summary**: the iNav adapter never attempts MAVLink signing (iNav doesn't support it per RESTRICT-COMM-2); the AP adapter always does. 
+ +**Traces to**: RESTRICT-COMM-2 + +**Description**: bring up the iNav adapter; assert no signing handshake is attempted (capture wire bytes, check no MAVLink2 signed-flag set). Bring up AP adapter; assert signing handshake completes. + +**Input data**: iNav SITL + AP SITL. + +**Expected result**: per the assertion above. + +**Max execution time**: 60 s. + +--- + +## Performance Tests + +### C8-PT-01: emission latency under load + +**Traces to**: AC-4.4 / AC-4.1 (emission portion) + +**Load scenario**: 5 Hz emit + 200 Hz inbound IMU subscribe + GCS at 2 Hz; 10 min replay. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `emit_external_position` p95 | ≤ 5 ms | 15 ms | +| Inbound IMU callback p95 | ≤ 1 ms | 5 ms | + +--- + +## Security Tests + +### C8-ST-01: MAVLink 2.0 per-flight signing handshake (D-C8-9 = (d), gated by IT-3) + +**Summary**: at takeoff, C8 (AP adapter) generates a per-flight ephemeral key and completes the MAVLink 2.0 signing handshake with ArduPilot SITL. + +**Traces to**: D-C8-9 = (d) (R03 risk) + +**Test procedure**: +1. Bring up SITL with no pre-shared key. +2. C8 generates a fresh per-flight key, completes handshake. +3. Send 100 signed messages; assert SITL accepts them. +4. Replay one of those messages with a tampered signature; assert SITL rejects it. + +**Pass criteria**: handshake succeeds; tampered messages rejected. +**Fail criteria**: handshake fails (escalate to D-C8-2-FALLBACK per ADR-008) OR tampered messages accepted. + +**Status**: gated by IT-3 SITL pass; this test IS the gate. + +--- + +### C8-ST-02: signing key never persists across flights + +**Summary**: per-flight ephemeral key is destroyed in memory at landing; never written to disk. + +**Traces to**: defensive (D-C8-9 = (d)) + +**Test procedure**: +1. Run a flight; capture the key in memory at takeoff via test harness. +2. Trigger landing. +3. Inspect process memory + disk for any residue of the key. +4. 
Assert no on-disk persistence; in-memory residue cleared by the configured zeroisation routine. + +**Pass criteria**: no on-disk + memory zeroised. +**Fail criteria**: any persistence. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-09-AP / FT-P-09-iNav / FT-P-12 / FT-P-13 / FT-P-14. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| ArduPilot SITL fixture (Plane) | upstream SITL Docker | image | +| iNav SITL fixture | upstream SITL Docker | image | +| QGC SITL | upstream Docker | image | +| Scripted `EstimatorOutput` sequences | scripted | <5 MB each | + +**Setup**: SITL containers must be running; C8 connects via simulated UART. +**Teardown**: stop containers; clean ephemeral keys. +**Data isolation**: per-test SITL container. diff --git a/_docs/02_document/components/11_c10_provisioning/description.md b/_docs/02_document/components/11_c10_provisioning/description.md new file mode 100644 index 0000000..cb14793 --- /dev/null +++ b/_docs/02_document/components/11_c10_provisioning/description.md @@ -0,0 +1,151 @@ +# C10 — Pre-flight Cache Provisioning + +## 1. High-Level Overview + +**Purpose**: build the **model-derived** pre-flight cache artifacts on top of an already-populated tile store, and verify them at takeoff. After C11 `TileDownloader` has fetched tiles into C6, C10 orchestrates: compile/deserialize TensorRT engines via C7 → batch each tile through C2's backbone for descriptors → atomically write FAISS HNSW index with SHA-256 sidecars (D-C10-3) → write Manifest with hash of (model + calibration + corpus + sector_class) for D-C10-1 idempotence. At F2 takeoff load, run `verify_manifest` (D-C10-3 SHA-256 content-hash gate) before allowing the system to arm. + +**C10 does NOT touch `satellite-provider`.** Tile I/O — both download (F1 inbound) and post-landing upload (F10) — lives in C11 (Tile Manager). C10 reads tiles from C6, writes engines + descriptors + manifest to filesystem and Postgres. 
The split is operational: C11 carries the operator-side network identity (TLS API key for download, per-flight signing key for upload) and the airborne-exclusion property (ADR-004); C10 carries the model identity and the takeoff-load verifier — neither of which need to leave the workstation/companion enclave at runtime. + +**Architectural Pattern**: Coordinator — single concrete implementation `CacheProvisioner` behind two interfaces (`CacheProvisioner` for the F1 build phase, `ManifestVerifier` for F2's content-hash gate). The interfaces are split because F2 only needs the verifier and shouldn't pull in the full provisioning code path. + +**Upstream dependencies**: + +- C12 OperatorTooling → triggers `build_cache_artifacts(...)` after C11 `TileDownloader` has populated C6. +- C6 TileStore + TileMetadataStore + DescriptorIndex → read source (tiles + metadata), write target (FAISS index). +- C7 InferenceRuntime → engine compile + deserialize. +- C2 backbone (via C7 engine) → descriptor batched generation. + +**Downstream consumers**: + +- F2 takeoff load → consumes `verify_manifest` outcome. + +## 2. 
Internal Interfaces + +### Interface: `CacheProvisioner` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `build_cache_artifacts` | `BuildRequest` | `BuildReport` | No (offline; minutes) | `EngineBuildError`, `DescriptorBatchError`, `ManifestWriteError`, `IdempotentNoOp` | +| `compile_engines_for_corpus` | `BackboneList` | `list[EngineCacheEntry]` | No | `EngineBuildError`, `CalibrationCacheError` | + +### Interface: `ManifestVerifier` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `verify_manifest` | `manifest_path: Path` | `VerificationResult` | No | `ManifestNotFoundError`, `ContentHashMismatchError` | + +**Input/Output DTOs**: + +``` +BuildRequest: + bbox: BoundingBox (lat_min, lon_min, lat_max, lon_max) # scopes which C6 tiles are in the manifest + zoom_levels: list[int] + sector_class: enum {active_conflict, stable_rear} # baked into manifest + calibration_path: Path + cache_root: Path + +BuildReport: + engines_built: int + engines_reused: int + descriptors_generated: int + manifest_hash: sha256 + outcome: enum {success, failure, idempotent_no_op} + failure_reason: string (optional) + +Manifest: see data_model.md +EngineCacheEntry: see data_model.md + +VerificationResult: + manifest_hash_match: bool + per_artifact_hash_match: dict[Path, bool] + outcome: enum {pass, fail} + fail_reasons: list[string] +``` + +## 3. External API Specification + +Not applicable. C10 has no network surface — all I/O is local filesystem + local Postgres. + +## 4. Data Access Patterns + +C10 reads `tiles` rows from C6 (scoped to the build's bbox + zoom_levels), writes the FAISS `.index` to filesystem via `Sha256Sidecar`, and writes Manifest + `manifests` row to Postgres via C6. + +### Storage Estimates + +| Table/Collection | Est. 
Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| Manifest | one per build per cached area | ~10 KB (YAML/JSON) | negligible | per build | +| SHA-256 sidecars | one per artifact (.index, calibration JSON, manifest, .engine) | 64 B (hex digest) | negligible | per build | + +### Data Management + +**Seed data**: none — C10 writes from scratch (or D-C10-1 idempotently no-ops). Tiles must already be in C6 (placed there by C11 `TileDownloader`); a missing-tiles condition is a build error, not a download trigger. + +**Rollback**: D-C10-1 manifest-hash check makes provisioning idempotent. Atomic writes (atomicwrites package) prevent partial states; on partial failure, the previous-good cache remains until the new one is fully written. + +## 5. Implementation Details + +**Algorithmic Complexity**: dominated by descriptor batched generation on Jetson (GPU-bound). Worst-case ~400 km² provisioning is ≤ tens of minutes (offline, not time-critical per AC-8.3). Tile network bandwidth is **not** in C10's budget — that cost is in C11. + +**State Management**: stateless w.r.t. flight lifetime. No connection state — all dependencies are local. + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| atomicwrites | latest | Atomic file replacement for `.index` + Manifest (D-C10-3) | +| hashlib (stdlib) | stdlib | SHA-256 content-hash sidecars | +| PyYAML / orjson | per project pin | Manifest serialization | + +**Error Handling Strategy**: + +- `EngineBuildError` / `CalibrationCacheError`: surfaced from C7 — never silently fall back; operator must intervene. +- `DescriptorBatchError`: CUDA OOM during descriptor generation. Halve batch size and retry once; if still OOM, surface to operator. +- `ManifestWriteError`: filesystem error or atomic-write rollback. Cache marked invalid; operator must re-run. 
+- `IdempotentNoOp`: D-C10-1 manifest-hash matched the prior build's hash; skip rebuild; emit no-op report. +- `ContentHashMismatchError` (F2): refuse takeoff; STATUSTEXT to GCS; FDR records the event; operator must re-run F1. +- **Missing tiles in C6 for the requested bbox/zoom**: surface as `BuildReport.failure` with explicit instruction to run C11 `TileDownloader` first; do **not** fall back to a network fetch — that responsibility lives in C11. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `Sha256Sidecar` | atomic write + content-hash sidecar pattern | C6, C7, C10 | +| `EngineFilenameSchema` | self-describing filename per D-C10-7 | C7, C10 | +| `WgsConverter` | bbox math | C4, C5, C6, C8, C10 | + +## 7. Caveats & Edge Cases + +**Known limitations**: + +- C10 depends on C6 already containing the tiles for the requested bbox + zoom levels. The F1 cache-build workflow (C12) sequences `C11 TileDownloader → C10 build_cache_artifacts`; C10 alone is not a complete F1. +- D-C10-3 SHA-256 content-hash gate must cover EVERY artifact: every tile (the per-tile hash is computed at C11 download time and stored in C6), the FAISS `.index`, the calibration JSON, and the Manifest itself. Missing sidecars are a release-blocking defect. + +**Potential race conditions**: + +- Concurrent `build_cache_artifacts` invocations on the same cache root would corrupt state. Single-process operator-tool wraps with a filesystem lockfile (the same lockfile C11 honours); if a second invocation tries to start, fail with explicit error. + +**Performance bottlenecks**: + +- Descriptor batched generation is GPU-bound; batching is the main lever (D-C7-1 INT8/FP16 mix decision applies). +- Engine compile is workspace-bound on Jetson; D-C10-6 calibration cache reuse is the main lever. + +## 8. 
Dependency Graph + +**Must be implemented after**: C6 (read source for tiles, write target for FAISS), C7 (engine + descriptor runtime), C2 (backbone interface for descriptor generation; called via C7). + +**Can be implemented in parallel with**: C8, C13. + +**Blocks**: C12 (operator can't sequence F1 without C10 ready), F1, F2 (verify_manifest), F8 (warm-cache verify on reboot recovery). + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `EngineBuildError`, `DescriptorBatchError`, `ManifestWriteError`, `ContentHashMismatchError` (F2) | `C10 engine build failed: backbone=disk; takeoff blocked` | +| WARN | engine cache miss falls through to build | `C10 engine cache miss: model=ultra_vpr; sm=87, jp=6.2, trt=10.3, fp16; rebuild` | +| INFO | Build start/end + report; verify_manifest pass | `C10 build complete: engines=4, descriptors=87654, manifest_hash=…; outcome=success` | +| DEBUG | per-tile descriptor batch progress | `C10 descriptor batch progress: 12345/87654 (14%)` | + +**Log format**: structured JSON. +**Log storage**: stdout (operator tool); journald (companion verify); FDR via C13 (only for F2 verify_manifest events — provisioning is offline and goes to operator-facing logs, not flight FDR). diff --git a/_docs/02_document/components/11_c10_provisioning/tests.md b/_docs/02_document/components/11_c10_provisioning/tests.md new file mode 100644 index 0000000..6aa8e63 --- /dev/null +++ b/_docs/02_document/components/11_c10_provisioning/tests.md @@ -0,0 +1,151 @@ +# Test Specification — C10 Pre-flight Cache Provisioning (engines + descriptors + manifest) + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. C10 was narrowed in this Plan cycle: it builds model-derived artifacts (TensorRT engines, VPR descriptors, signed Manifest) from an **already-populated** C6 tile cache. Tile fetch is C11 `TileDownloader`'s concern. 
+ +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-8.3 | Imagery pre-loaded onto companion before flight (the manifest gate) | FT-P-15, FT-P-16, **C10-IT-01** | Covered | +| AC-NEW-1 | Cold-start TTFF <30 s p95 (pre-built engines required) | NFT-PERF-03, **C10-IT-02** | Covered | +| D-C10-1 | Manifest-hash idempotence on repeated build | **C10-IT-03** | Covered | +| D-C10-3 | Takeoff content-hash gate refuses mismatch | covered at C7-IT-03; **C10-IT-04** asserts the manifest signing path | Covered | +| D-C10-6 | Engine cache hardware-tied (SM 87 / JP 6.2 / TRT 10.3 / FP16) | C7-IT-04, **C10-IT-05** | Covered | +| D-C10-7 | Engine filename schema enforcement | covered at C7-IT-04 | Covered | + +--- + +## Component-Internal Tests + +### C10-IT-01: end-to-end build from a pre-populated C6 + +**Summary**: given a C6 tile cache populated by C11 `TileDownloader` (10 GB Derkachi area), C10 produces (a) all required TensorRT engines, (b) the FAISS HNSW index over VPR descriptors, (c) a signed Manifest, in under the operator-tooling time budget. + +**Traces to**: AC-8.3, AC-NEW-1 + +**Description**: stage a C6 with the Derkachi corpus already populated; run `CacheProvisioner.build_cache_artifacts`; assert (a) the engine set under `cache_artifacts/engines/` matches the configured model list, (b) `descriptor_index.faiss` is non-empty and queryable, (c) the Manifest is signed with the operator's signing key and content-hashes every artifact. + +**Input data**: pre-populated C6 (`tests/fixtures/c6_populated_derkachi/`). + +**Expected result**: all artifacts present + signed Manifest. + +**Max execution time**: 12 min on Tier-1 (CPU TRT compile is slow; Tier-2 takes ~4 min and is the production path). 
+ +--- + +### C10-IT-02: ManifestVerifier refuses unsigned / wrong-signature Manifest + +**Summary**: `ManifestVerifier.verify_manifest` rejects a Manifest whose signature doesn't validate against the operator's public key. + +**Traces to**: AC-NEW-1, D-C10-3 + +**Description**: build a valid Manifest; copy it; tamper one byte; call `verify_manifest`; assert `ManifestSignatureError`. Repeat: copy + replace signature with one signed by an unauthorized key; assert `ManifestSignatureError`. + +**Input data**: valid Manifest + 2 tampered copies. + +**Expected result**: both tampered Manifests rejected. + +**Max execution time**: 5 s. + +--- + +### C10-IT-03: idempotence on repeated build + +**Summary**: re-running `build_cache_artifacts` against an unchanged C6 produces the same Manifest content-hash and skips already-built engines. + +**Traces to**: D-C10-1 + +**Description**: run build once; record Manifest content-hash + engine compile timestamps. Re-run with no C6 changes; assert (a) Manifest content-hash unchanged, (b) engines reused (no recompile, asserted via timestamp comparison), (c) total wall-clock < 1 min on Tier-1. + +**Input data**: as C10-IT-01. + +**Expected result**: idempotent — same hash, no recompile. + +**Max execution time**: 90 s (second-run only). + +--- + +### C10-IT-04: Manifest covers every shipped artifact + +**Summary**: the Manifest's content-hash table includes every file under `cache_artifacts/`; an artifact present on disk but missing from the Manifest is a build failure. + +**Traces to**: D-C10-3 (no smuggled artifacts can pass the takeoff gate) + +**Description**: after a successful build, plant an extra file in `cache_artifacts/`; re-run `build_cache_artifacts` (or call the build's post-step audit hook); assert build refuses to sign — output `ManifestCoverageError` listing the orphan file. + +**Input data**: as C10-IT-01 plus an extra file. + +**Expected result**: build fails with `ManifestCoverageError`. + +**Max execution time**: 60 s. 
+ +--- + +### C10-IT-05: Tier-2 hardware-tied engine compile produces SM-87 / JP-6.2 / TRT-10.3 binary + +**Summary**: when run on the bench Jetson, C10 produces engines whose internal TRT metadata reports `SM=87, JetPack=6.2, TRT=10.3, precision=FP16`. + +**Traces to**: D-C10-6 + +**Description**: run `build_artifacts` on the bench Jetson; for each engine, parse the internal TRT version footer; assert the quadruple matches. + +**Input data**: bench Jetson + Derkachi C6 fixture. + +**Expected result**: all engines tagged correctly. + +**Max execution time**: 6 min on Tier-2. + +--- + +## Performance Tests + +### C10-PT-01: build wall-clock budget on Tier-1 (operator-tooling laptop) + +**Traces to**: operator-tooling UX (no AC trace; an operator-tooling SLO) + +**Load scenario**: full Derkachi corpus (10 GB, ~87 654 tiles). + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| Cold build wall-clock | ≤ 12 min on a developer laptop with NVIDIA GPU | 25 min | +| Warm idempotent re-run | ≤ 1 min | 3 min | + +--- + +## Security Tests + +### C10-ST-01: signing-key path uses operator-controlled key (not a baked-in dev key) + +**Summary**: the build refuses to sign the Manifest if the configured signing-key path points to the baked-in dev key (caught via a hash-list check). + +**Traces to**: defensive (production-key safety) + +**Test procedure**: +1. Configure C10 with the dev-key path that's hard-coded into the dev fixtures. +2. Run `build_artifacts`. +3. Assert refusal with `OperatorKeyRequiredError`. + +**Pass criteria**: refusal. +**Fail criteria**: build succeeds with the dev key. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-15 / FT-P-16 (operator workflow tests). 
+ +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| `tests/fixtures/c6_populated_derkachi/` | C11 `TileDownloader` build artifact | ~10 GB on disk | +| Operator signing key (test-only) | generated per test run | <1 KB | +| Dev key (for the negative test) | curated, in-repo | <1 KB | + +**Setup**: C11 `TileDownloader` integration test (under C11) populates C6 once; that artifact is reused. +**Teardown**: per-test temp dirs for `cache_artifacts/` build outputs. +**Data isolation**: per-test temp `cache_artifacts/`. diff --git a/_docs/02_document/components/12_c11_tilemanager/description.md b/_docs/02_document/components/12_c11_tilemanager/description.md new file mode 100644 index 0000000..b4434ee --- /dev/null +++ b/_docs/02_document/components/12_c11_tilemanager/description.md @@ -0,0 +1,201 @@ +# C11 — Tile Manager + +## 1. High-Level Overview + +**Purpose**: own the operator-side network I/O against `satellite-provider` for the onboard tile corpus, in **both directions**: + +- **Download** (pre-flight, F1): fetch tiles from `satellite-provider` for the operational area, apply AC-NEW-6 freshness gating, and write into C6 (`TileStore` + `TileMetadataStore`). C11 is the **only** path that crosses the workstation/companion enclave to the parent suite for tile pixels — C10 reads from the populated C6 store and never touches `satellite-provider` itself. +- **Upload** (post-landing, F10): when `flight_state == ON_GROUND` is confirmed, read pending mid-flight tiles from C6 and POST to `satellite-provider`'s ingest endpoint (D-PROJ-2 contract sketch). + +C11 is a **separate operator-side binary / image**. The airborne companion image's CMake target deliberately excludes the entire `c11_tilemanager/` source tree so the airborne process cannot accidentally execute either the download path or the upload path even via reflection or config error (ADR-004 process-level isolation, AC-8.4). 
Both directions of tile I/O are operator-driven on the operator workstation; the companion only consumes the populated C6 store while airborne. + +**Architectural Pattern**: Pipeline behind two interfaces (`TileDownloader`, `TileUploader`) under one component, consistent with C8's multi-interface shape (FC-AP, FC-iNav, GCS adapters under one component). The two interfaces are bundled into C11 because they share auth (TLS + service-internal API key for download, per-flight onboard signing key for upload), HTTP client, network configuration, deployment unit (operator-tooling tarball), and the airborne-exclusion property — splitting them into two components would duplicate all of that. They are kept as **two interfaces** so SRP is preserved at the call-site level: C12 binds `TileDownloader` for the F1 cache-build workflow, `TileUploader` for the F10 post-landing trigger; neither is forced to depend on the other. + +**Upstream dependencies**: + +- C12 OperatorTooling → invokes `TileDownloader.download_tiles_for_area(...)` during F1 and `TileUploader.upload_pending_tiles(...)` post-landing. +- C6 TileStore + TileMetadataStore → write target during download (`source = googlemaps`); read source during upload (`source = onboard_ingest`, `voting_status = pending`). +- Operator workstation OS → invocation entry point (CLI / tray app, owned by C12). +- `satellite-provider` (external) → `GET /api/satellite/tiles?bbox=…&zoom=…` for download; `POST /api/satellite/tiles/ingest` for upload (D-PROJ-2 design task #1, **planned, not yet implemented service-side**). + +**Downstream consumers**: + +- C10 CacheProvisioner reads the populated C6 store after a `TileDownloader` run completes; C10 does not call C11 directly. C12 sequences the two steps. +- On the upload side: none on the onboard side; the parent-suite voting layer (D-PROJ-2 design task #2) consumes the uploaded tiles asynchronously. + +## 2. 
Internal Interfaces + +### Interface: `TileDownloader` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `download_tiles_for_area` | `DownloadRequest` | `DownloadBatchReport` | No (offline; minutes) | `SatelliteProviderError`, `FreshnessRejectionError`, `ResolutionRejectionError`, `CacheBudgetExceededError`, `IdempotentNoOp` | +| `enumerate_remote_coverage` | `bbox: BoundingBox, zoom_levels: list[int]` | `list[TileSummary]` | No | `SatelliteProviderError` | + +### Interface: `TileUploader` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `confirm_flight_state` | `()` | `FlightStateSignal` (must be ON_GROUND) | No | `FlightStateNotOnGroundError` | +| `enumerate_pending_tiles` | `flight_id: uuid (optional)` | `list[TileMetadata]` | No | `TileMetadataError` | +| `upload_pending_tiles` | `UploadRequest` | `UploadBatchReport` | No | `SatelliteProviderError`, `RateLimitedError`, `SignatureRejectedError` | + +**Input/Output DTOs**: + +``` +DownloadRequest: + bbox: BoundingBox (lat_min, lon_min, lat_max, lon_max) + zoom_levels: list[int] + sector_class: enum {active_conflict, stable_rear} + satellite_provider_url: URL + service_api_key: string + cache_root: Path + +DownloadBatchReport: + tiles_downloaded: int + tiles_rejected_freshness: int + tiles_rejected_resolution: int # RESTRICT-SAT-4 < 0.5 m/px + tiles_downgraded: int + freshness_summary: dict[freshness_label, count] + outcome: enum {success, failure, idempotent_no_op} + failure_reason: string (optional) + +UploadRequest: + flight_id: uuid (optional; defaults to all flights with pending tiles) + batch_size: int + satellite_provider_url: URL + +FlightStateSignal: see C8 — must be ON_GROUND for any upload to proceed + +UploadBatchReport: + batch_uuid: uuid (assigned by satellite-provider per D-PROJ-2 contract) + per_tile_status: list[(tile_id, status: enum {queued, rejected, duplicate, superseded})] + 
retry_count: int + next_retry_at_s: int (when retried) + outcome: enum {success, partial, failure} +``` + +## 3. External API Specification + +C11 is a **client** of `satellite-provider`'s REST surface in both directions. + +### 3.1 Download — read path (existing `satellite-provider` API) + +| Endpoint | Method | Auth | Rate Limit | Description | +|----------|--------|------|------------|-------------| +| `/api/satellite/tiles?bbox=…&zoom=…` | GET | TLS + service-internal API key | parent-suite enforces | Paged tile blobs + metadata for a bounding box at the given zoom level(s). | + +C11 honours `Retry-After` on 429s, fails fast on TLS / auth errors, retries with backoff on 5xx. Resolution below 0.5 m/px (RESTRICT-SAT-4) is rejected at the C11 boundary, not pushed downstream. + +### 3.2 Upload — write path (D-PROJ-2 contract sketch, **planned**) + +| Endpoint | Method | Auth | Rate Limit | Description | +|----------|--------|------|------------|-------------| +| `/api/satellite/tiles/ingest` (parent-suite, **planned**) | POST | Per-flight onboard signing key (D-C8-9 = (d) family); each tile carries the signature | parent-suite enforces | Multipart upload of one or more tiles; response 202 with batch UUID + per-tile status. | + +**Request schema** (multipart fields per tile): + +- `tile_blob` (JPEG body, byte-identical to `satellite-provider`'s existing tile format) +- `zoomLevel` (int) +- `latitude` / `longitude` (double) +- `tile_size_meters` (double), `tile_size_pixels` (int) +- `capture_timestamp` (ISO 8601), `flight_id` (UUID), `companion_id` (string) +- `quality_metadata` (JSON; `TileQualityMetadata` per data_model.md) +- `signature` (per-flight onboard signing key signature over the payload) + +**Response**: 202 Accepted + `{batch_uuid: UUID, per_tile_status: [...]}`. 
+ +Test substitute during NFT-SEC-01 / FT-P-17 / IT runs: the e2e-test `mock-suite-sat-service` fixture (under `tests/fixtures/mock-suite-sat-service/`) implements the planned POST surface so upload integration tests can run before D-PROJ-2 ships service-side. Download integration tests run against the **real** `satellite-provider` (its existing GET surface is already implemented). The mock is not a component and is never reached in production. + +## 4. Data Access Patterns + +C11 reads from / writes to C6 (the local store) and reads from / writes to `satellite-provider` (network). It owns no relational state of its own beyond a small download-progress journal and a small upload-progress journal. + +### Caching Strategy + +| Data | Cache Type | TTL | Invalidation | +|------|-----------|-----|-------------| +| Download-progress journal | filesystem alongside the operator workstation cache root | until a `download_tiles_for_area` run completes | per-area run on completion | +| Pending-upload journal | filesystem alongside the operator workstation cache root | until upload acknowledged | per-batch acknowledgment removes from journal | + +### Storage Estimates + +| Table/Collection | Est. Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| Download-progress journal | a few hundred rows per area provisioning | ~256 B / row | <1 MB | per provisioning run | +| Pending-upload journal | a few hundred per flight | ~256 B / row | <1 MB | per flight | + +### Data Management + +**Seed data**: none — both journals are empty until the operator triggers a download or an upload run. + +**Rollback**: the download path is idempotent — re-running `download_tiles_for_area` for an unchanged `(bbox, zoom_levels, sector_class)` triggers C10's manifest-hash check (D-C10-1) downstream and the engine/descriptor build is skipped. 
The upload path is idempotent on the service side via the `(zoomLevel, lat, lon, capture_timestamp, companion_id, flight_id)` dedup key. + +## 5. Implementation Details + +**Algorithmic Complexity**: + +- Download: linear in tile count; bandwidth-bound by the operator workstation's link to `satellite-provider`. +- Upload: linear in pending tile count; bandwidth-bound; bursty post-landing. + +**State Management**: stateless except for the two journals. + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| httpx | per project pin | GET (download) + multipart POST (upload) to `satellite-provider` | +| atomicwrites | latest | Journal updates | +| cryptography | per project pin | Per-flight signing key (upload payload signing); the production `satellite-provider` ingest endpoint and the e2e-test `mock-suite-sat-service` fixture both verify with the same key family | + +**Error Handling Strategy**: + +- `SatelliteProviderError`: HTTP timeout / 5xx / TLS failure on either direction. Retry-with-backoff on 5xx; fail fast on TLS / auth. On download, surface to operator + takeoff blocked. On upload, leave tiles in the pending-upload journal and surface to operator. **Do not delete uploaded tiles from C6** until acknowledged. +- `RateLimitedError` (429): obey `Retry-After`; the operator can also re-invoke later. Same handling either direction. +- `FreshnessRejectionError` / `ResolutionRejectionError`: download-side only. Per AC-NEW-6 / RESTRICT-SAT-4 — never silently downgrade fresh-required tiles in `active_conflict` sectors. Surface counts in the `DownloadBatchReport`. +- `CacheBudgetExceededError`: download-side only. Pre-flight free-space check against AC-8.3 (≤ 10 GB). Fail fast with explicit budget delta; no partial write. +- `FlightStateNotOnGroundError`: upload-side only. Refuse to start; log + show explicit reason. 
ADR-004 process-level isolation means C11 should never run when the FC believes it's airborne — this error is a defense-in-depth, not the primary control. +- `SignatureRejectedError`: upload-side only. Per-flight signing key was rejected by `satellite-provider`. This is a security-critical event — do NOT silently drop; surface to operator + log to FDR. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `TileSignaturePayloadBuilder` | constructs the signed payload for D-PROJ-2 contract (upload) | C11 only — keep inside the component | + +## 7. Caveats & Edge Cases + +**Known limitations**: + +- D-PROJ-2 ingest endpoint is NOT yet implemented service-side. Until parent-suite delivers the endpoint, C11 will fail every upload — the pending-upload journal accumulates. Operator workflow tolerates this. +- The e2e-test `mock-suite-sat-service` fixture implements only the planned POST contract (per the leftover file). Download integration tests run against the real `satellite-provider`. Production runs reach `satellite-provider` directly in both directions; the fixture is never on the production path. +- `TileDownloader` requires the operator workstation to have network reach to `satellite-provider` (the only path that crosses out of the workstation enclave). Pre-flight network configuration is an operator concern owned by C12; C11 fails fast if reachability is missing. + +**Potential race conditions**: + +- If the operator launches two `TileDownloader` runs concurrently against the same cache root, a filesystem lockfile (operated by C12 tooling) prevents corrupting C6's tile rows. Same lockfile gates concurrent `TileUploader` invocations. + +**Performance bottlenecks**: + +- Download: bandwidth-bound by the operator workstation's `satellite-provider` link; descriptor / engine work is downstream in C10 (offline, minutes). +- Upload: bandwidth-bound. 
Per-flight upload volume is bounded by the F4 mid-flight tile gen cap (typically a few hundred tiles, each 50–200 KB → tens of MB per flight). + +## 8. Dependency Graph + +**Must be implemented after**: C6 (read source for upload, write target for download), `satellite-provider` (download path; existing) + D-PROJ-2 endpoint (upload path; the e2e-test `mock-suite-sat-service` fixture covers tests until the real endpoint ships). + +**Can be implemented in parallel with**: anything except C6 changes. + +**Blocks**: F1 (pre-flight cache build cannot start without `TileDownloader`), F10 (post-landing upload cannot start without `TileUploader`). + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `FlightStateNotOnGroundError`, `SignatureRejectedError`, persistent `SatelliteProviderError`, `CacheBudgetExceededError` | `C11 refused to start: flight_state=IN_AIR; safeguard active` | +| WARN | one-off network failure, scheduled retry, freshness-driven rejections (counts) | `C11 batch upload retry: batch_uuid=…; next_retry_in_s=30` | +| INFO | session start/end; per-batch report (download + upload) | `C11 download complete: 87654 tiles, 12 stale-rejected; bbox=…` | +| DEBUG | per-tile request/response | `C11 tile uploaded: tile_id=(z=18,lat=…,lon=…); status=queued` | + +**Log format**: structured JSON. +**Log storage**: operator workstation log file (e.g. `~/.azaion/onboard/c11-tilemanager.log`); also writes per-run summaries (download report, upload report) to the operator workstation cache root for audit. The companion's FDR is NOT involved (C11 doesn't run on the companion). 
diff --git a/_docs/02_document/components/12_c11_tilemanager/tests.md b/_docs/02_document/components/12_c11_tilemanager/tests.md new file mode 100644 index 0000000..ff887b5 --- /dev/null +++ b/_docs/02_document/components/12_c11_tilemanager/tests.md @@ -0,0 +1,213 @@ +# Test Specification — C11 Tile Manager (TileDownloader + TileUploader) + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. C11 was renamed and expanded in this Plan cycle: it now owns BOTH operator-side network I/O directions against `satellite-provider`. Strict ADR-004 enforcement: never loaded into the airborne companion image. + +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-8.1 | Imagery via Suite Sat Service offline cache, ≥0.5 m/px | FT-P-15, **C11-IT-01** (download) | Covered | +| AC-8.2 | Tile freshness <6 mo (active-conflict) / <12 mo (rear) | FT-N-05, **C11-IT-02** (download-side gate) | Covered | +| AC-8.3 | Imagery pre-loaded onto companion before flight | FT-P-15, FT-P-16, **C11-IT-01** | Covered | +| AC-NEW-6 | System rejects/downgrades stale tiles | FT-N-05, FT-N-06, **C11-IT-02** | Covered | +| AC-8.4 + D-PROJ-2 (POST contract) | Mid-flight tile generation + post-landing upload to satellite-provider | FT-P-17, **C11-IT-03** (upload) | Covered (against e2e-test fixture until D-PROJ-2 ships) | +| RESTRICT-SAT-1 | Onboard cache offline-only; no in-flight Service calls | NFT-SEC-02, NFT-SEC-05, **C11-ST-01** | Covered | +| ADR-004 | Process isolation; C11 never airborne | **C11-ST-01**, **C11-ST-02** | Covered (R02 enforcement) | + +--- + +## Component-Internal Tests + +### C11-IT-01: TileDownloader fetch + sector-classified freshness gate + write to C6 + +**Summary**: given a Derkachi-area request, `TileDownloader.download_tiles_for_area` retrieves tiles from the real `satellite-provider` (or fixture in tests), enforces the per-sector freshness gate, 
and writes accepted tiles into C6 with byte-identical filesystem layout. + +**Traces to**: AC-8.1, AC-8.2, AC-8.3, AC-NEW-6 + +**Description**: configure the fetch job with the Derkachi bbox + zoom range + sector classification (active_conflict + stable_rear mix); run `download_tiles_for_area`; assert (a) all returned tiles within the bbox land in C6, (b) tiles with `produced_at` older than the per-sector threshold are downgrade-flagged or rejected, (c) the `DownloadBatchReport` reports counts that sum to the request size. + +**Input data**: scripted fetch request + the real `satellite-provider` (or its test Docker fixture under `tests/fixtures/satellite-provider/`; this is the REAL service's existing GET surface, not the mock). + +**Expected result**: tiles in C6 match request scope; freshness flags applied per sector. + +**Max execution time**: 4 min on Tier-1 (network-dependent). + +--- + +### C11-IT-02: freshness rejection counts surface in DownloadBatchReport + +**Summary**: when the source has stale tiles, `DownloadBatchReport.tiles_rejected_freshness` is non-zero and matches the count rejected at the C6 boundary. + +**Traces to**: AC-NEW-6 (operator visibility) + +**Description**: run a fetch that hits a known-stale tile range; assert (a) `tiles_rejected_freshness > 0`, (b) the value equals C6's freshness-rejection count for that batch. + +**Input data**: a synthetic `satellite-provider` response with stale tiles. + +**Expected result**: counts match. + +**Max execution time**: 60 s. + +--- + +### C11-IT-03: TileUploader posts mid-flight tiles + signs payload + +**Summary**: `TileUploader.upload_pending_tiles` reads mid-flight tiles from C6's pending-upload set, packages them per the D-PROJ-2 POST contract sketch, signs each payload with the per-flight key, and posts to the configured `/api/satellite/tiles/ingest` endpoint. 
+ +**Traces to**: AC-8.4, D-PROJ-2 contract + +**Description**: stage C6 with 50 mid-flight tiles flagged pending-upload; bring up the e2e-test `mock-suite-sat-service` fixture (under `tests/fixtures/mock-suite-sat-service/`); call `upload_pending_tiles`; assert (a) all 50 tiles POSTed, (b) each payload signature verifies against the test public key, (c) on 202 Accepted, the C6 row is marked uploaded, (d) the fixture's request log shows all 50 tiles in arrival. + +**Input data**: scripted C6 + e2e-test mock fixture. + +**Expected result**: all 50 uploaded + acknowledged + marked. + +**Max execution time**: 5 min. + +--- + +### C11-IT-04: TileUploader gates on `flight_state == ON_GROUND` + +**Summary**: `TileUploader.upload_pending_tiles` refuses to run if `FlightStateSignal != ON_GROUND` (defense-in-depth atop ADR-004 process isolation). + +**Traces to**: AC-8.4 (defensive — ADR-004's secondary guard) + +**Description**: call `upload_pending_tiles` with `FlightStateSignal == IN_FLIGHT`; assert `FlightStateNotOnGroundError`. Same with `UNKNOWN`. Set `ON_GROUND` and assert upload proceeds. + +**Input data**: scripted FlightStateSignal source. + +**Expected result**: upload blocked except in `ON_GROUND`. + +**Max execution time**: 30 s. + +--- + +### C11-IT-05: idempotent uploads on retry + +**Summary**: re-running `upload_pending_tiles` after a partial-success batch only POSTs the tiles that weren't acknowledged before. + +**Traces to**: D-PROJ-2 contract resilience + +**Description**: stage 50 pending-upload tiles; call `upload_pending_tiles` with the mock configured to return 202 for first 30, 5xx for last 20; assert C6 marks 30 as uploaded. Reset mock to 202 for all; call `upload_pending_tiles` again; assert only the remaining 20 are POSTed and the first 30 are NOT re-sent. + +**Input data**: scripted with controlled mock failure profile. + +**Expected result**: per assertion above. + +**Max execution time**: 60 s. 
+ +--- + +## Performance Tests + +### C11-PT-01: download throughput + +**Traces to**: operator-tooling SLO (AC-8.3 supports this). + +**Load scenario**: 10 GB Derkachi area, parallel fetch. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| Throughput | ≥ 50 MB/s on a 1 Gbps link | < 20 MB/s | +| `DownloadBatchReport` produced | yes | no report = test failure | + +--- + +### C11-PT-02: upload throughput + +**Traces to**: post-landing operator UX (AC-8.4 + D-PROJ-2 timing assumption). + +**Load scenario**: 1000 mid-flight tiles, sequential POST + sign. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| Throughput | ≥ 20 tile/s with signing | < 10 tile/s | + +--- + +## Security Tests + +### C11-ST-01: airborne process cannot import `c11_tilemanager` + +**Summary**: per ADR-004 R02 enforcement, the airborne `production-binary` artifact has no path to import the C11 module. + +**Traces to**: ADR-004, R02 + +**Test procedure**: +1. Build the airborne `production-binary`. +2. Inside a sandbox running the artifact, attempt `import c11_tilemanager` (and any submodule). +3. Assert `ModuleNotFoundError` for every variant. +4. Run the `runtime_root.py` self-check; assert it does NOT panic (because `find_spec` returns `None`). + +**Pass criteria**: import fails everywhere; self-check passes silently. +**Fail criteria**: any successful import. + +--- + +### C11-ST-02: NFT-SEC-02 network-egress test + +**Summary**: the airborne process running in a no-route-to-`satellite-provider` network namespace cannot reach `satellite-provider` over TCP. + +**Traces to**: ADR-004 R02 enforcement; covers `RESTRICT-SAT-1`. + +**Test procedure**: as documented in `tests/security-tests.md` NFT-SEC-02. Listed here for traceability. + +**Pass criteria**: zero outbound connection attempts to `satellite-provider`'s host:port from the airborne process during a 10-min replay. 
+**Fail criteria**: any attempt. + +--- + +### C11-ST-03: per-flight signing key zeroised after upload completes + +**Summary**: after `upload_pending` completes (success or final failure), the per-flight key is zeroised in memory. + +**Traces to**: D-C8-9 = (d) (operator-side analogue) + +**Test procedure**: +1. Run an upload. +2. Capture the key buffer post-call via test harness. +3. Assert the buffer is zero-filled. + +**Pass criteria**: zero-filled. +**Fail criteria**: any non-zero residue. + +--- + +## Acceptance Tests + +### C11-AT-01: operator runs F1 download + F10 upload via CLI + +**Summary**: end-to-end operator flow exercises both interfaces via the C12-driven CLI. + +**Traces to**: AC-8.3, AC-8.4 + +**Preconditions**: +- Operator workstation with Docker + the operator-tooling tarball. +- A `satellite-provider` (real or test fixture) reachable. + +**Steps**: + +| Step | Action | Expected Result | +|------|--------|-----------------| +| 1 | `operator-tool download --area derkachi.geojson --since 2026-01` | `DownloadBatchReport` printed; tiles in C6 | +| 2 | `operator-tool build-cache` | C10 builds engines + descriptors + Manifest | +| 3 | (simulate flight) | (covered by other tests) | +| 4 | `operator-tool upload-pending` | Pending-upload tiles POSTed; report printed | + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Real `satellite-provider` Docker fixture (download) | upstream parent-suite Docker | image | +| e2e-test `mock-suite-sat-service` fixture (upload) | `tests/fixtures/mock-suite-sat-service/` | <50 MB image | +| Scripted Derkachi bbox + sector classification | scripted | <1 MB | + +**Setup**: bring up the appropriate fixture per test (real for download; e2e-test mock for upload until D-PROJ-2 ships). +**Teardown**: stop fixture containers; clean per-test C6. +**Data isolation**: per-test C6 + per-test fixture instance. 
diff --git a/_docs/02_document/components/13_c12_operator_tooling/description.md b/_docs/02_document/components/13_c12_operator_tooling/description.md new file mode 100644 index 0000000..07b7648 --- /dev/null +++ b/_docs/02_document/components/13_c12_operator_tooling/description.md @@ -0,0 +1,145 @@ +# C12 — Operator Pre-flight Tooling + +## 1. High-Level Overview + +**Purpose**: operator-facing tooling on the workstation that **sequences** the F1 cache-build workflow (calls C11 `TileDownloader` then C10 `CacheProvisioner`), drives sector classification, freshness pipeline, calibration loading, and (post-flight) triggers the C11 `TileUploader`. Provides the human-in-the-loop entry point for everything that needs operator judgement (active-conflict vs stable-rear sector classification, AC-3.4 ≥3-frame outage operator re-loc requests, network configuration to `satellite-provider`). + +**Architectural Pattern**: Coordinator (single concrete `OperatorTooling` today). Two interfaces: `CacheBuildWorkflow` (pre-flight UX) and `OperatorReLocService` (mid-flight operator re-loc requests, AC-3.4). The latter is consumed via the GCS link by C8 (operator commands subscription). + +**Upstream dependencies**: +- Operator (human input). +- C11 `TileDownloader` (operator-workstation-side) — invoked first in F1 to populate C6 from `satellite-provider`. +- C10 CacheProvisioner (companion-side) — invoked second in F1, over USB/Eth. +- Camera calibration artifact (operator workstation filesystem). + +**Downstream consumers**: +- C11 `TileDownloader` (workstation) — invoked at F1 start. +- C10 CacheProvisioner (companion) — invoked via USB/Eth after C11 download completes. +- C11 `TileUploader` — invoked post-landing. +- C8 inbound operator-commands subscription — receives AC-3.4 re-loc requests over the GCS link. + +## 2. 
Internal Interfaces + +### Interface: `CacheBuildWorkflow` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `build_cache` | `bbox, sector_class, calibration_path, satellite_provider_url, api_key` | `CacheBuildReport` (wraps C11 `DownloadBatchReport` + C10 `BuildReport`) | No (operator-facing; minutes) | `CacheBuildError` (wraps SatelliteProviderError, EngineBuildError, etc.) | +| `trigger_post_landing_upload` | `flight_id` | C11 `UploadBatchReport` | No (operator-facing; minutes) | `CacheBuildError` wrapper around `FlightStateNotOnGroundError`, `SignatureRejectedError`, etc. | +| `verify_companion_ready` | `companion_address` | `ReadinessReport` | No | `CompanionUnreachableError`, `ContentHashMismatchError` | +| `set_sector_classification` | `area, sector_class` | `None` | No | — | +| `apply_freshness_threshold` | `sector_class` | `int (months)` | No | — | + +### Interface: `OperatorReLocService` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `request_reloc` | `reloc_hint: ReLocHint` | `None` (sent over GCS link to companion) | No | `GcsLinkError` | + +**Input/Output DTOs**: +``` +CacheBuildReport: + download_report: DownloadBatchReport (see C11 spec) + build_report: BuildReport (see C10 spec) + outcome: enum {success, failure, idempotent_no_op} + failure_phase: enum {download, build, none} + failure_reason: string (optional) + +ReadinessReport: + manifest_present: bool + content_hashes_pass: bool + engines_present: bool + calibration_present: bool + outcome: enum {ready, not_ready} + not_ready_reasons: list[string] + +ReLocHint: + approximate_position_wgs84: LatLonAlt (operator's best guess) + confidence_radius_m: float + reason: string (e.g. "lost track at waypoint 3") +``` + +## 3. External API Specification + +C12 itself does NOT expose HTTP. 
It **consumes**: + +- `satellite-provider`'s REST API **only via C11 `TileDownloader`** — C12 itself never holds the TLS / API-key credentials. The credential boundary is C11 (operator-workstation-side); C12 sequences and reports. +- The GCS link's MAVLink path (operator commands path), for AC-3.4 re-loc requests. + +C12 may eventually expose a thin local HTTP API for an operator GUI. **Plan-phase carryforward, deferred** — the current cycle ships only a CLI. + +## 4. Data Access Patterns + +C12 holds the operator workstation's local cache staging area + per-area sector classification metadata. + +### Storage Estimates + +| Table/Collection | Est. Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| Sector classification persistence | tens of areas | ~1 KB / area | <100 KB | per operator deployment | +| Local cache staging area | matches the area being prepared | up to ~10 GB pre-stage | bounded by operator workstation NVM | per operation | + +### Data Management + +**Seed data**: none — operator drives every decision. + +**Rollback**: rebuilding the cache from scratch is the rollback action; D-C10-1 idempotence makes it cheap if no inputs changed. + +## 5. Implementation Details + +**Algorithmic Complexity**: dominated by C11 `TileDownloader` bandwidth + C10's descriptor-batching cost (both offline). C12's own logic is constant-time per operator decision. + +**State Management**: holds a session-local view of the operator's classifications + active provisioning job. No long-lived state on the companion. 
+ +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| Click / Typer | per project pin | CLI framework | +| paramiko / ssh / fabric | per project pin | Companion bring-up over SSH (USB/Eth) | +| pymavlink | bundled per D-C8-3 | GCS-link operator commands (AC-3.4) | +| (httpx is NOT a direct C12 dep) | — | All `satellite-provider` HTTP traffic is owned by C11; C12 calls C11 in-process | + +**Error Handling Strategy**: +- `CacheBuildError`: wraps the underlying error from C11/C10/C7/C6 with operator-friendly text + remediation hint. Includes `failure_phase: download | build` so the operator knows which step to retry. +- `CompanionUnreachableError`: pre-flight bring-up failure; operator must check the wire/SSH config. +- `ContentHashMismatchError`: post-stage tampering detected; refuse to mark as ready; operator must re-run F1. +- `GcsLinkError` (AC-3.4 path): GCS link drop. Re-loc request is best-effort; operator may need to re-issue when link recovers. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `WgsConverter` | WGS84 coordinate conversion (common helper shared with C4, C5, C6, C8, C10) | C4, C5, C6, C8, C10, C12 | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- Operator UX is CLI-only this cycle; GUI is a Plan-phase carryforward (deferred). +- AC-3.4 operator re-loc is best-effort over a bandwidth-limited GCS link; the companion must remain useful even when no re-loc hint arrives. + +**Potential race conditions**: +- Operator re-runs of F1 while a previous F1 is still in progress: filesystem lockfile in the cache staging area prevents concurrency. + +**Performance bottlenecks**: +- Pre-flight tile download (C11) + descriptor build (C10) is the long pole; C12 just shows progress and surfaces failures. + +## 8. Dependency Graph + +**Must be implemented after**: C11 (both `TileDownloader` and `TileUploader`), C10 (cache artifact build), C8 inbound operator-commands subscription (for AC-3.4). 
+ +**Can be implemented in parallel with**: anything not in the above list. + +**Blocks**: F1 (without C12 there is no operator entry point), F10 (without C12 there is no post-landing trigger). + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `CacheBuildError`, `CompanionUnreachableError`, `ContentHashMismatchError`, `GcsLinkError` | `C12 cache build failed: reason=satellite_provider_5xx; retried=3` | +| WARN | freshness downgrade applied; non-fatal stage warning | `C12 freshness downgrade: area=Derkachi; stale_count=42` | +| INFO | start/end of provisioning; sector classification set; AC-3.4 hint sent | `C12 sector classification updated: area=Derkachi; class=active_conflict` | +| DEBUG | per-step progress mirrors of C11 (download) and C10 (build) | `C12 progress: download=12345/87654 (14%); build=engine=2/4` | + +**Log format**: structured JSON. +**Log storage**: operator workstation log file (e.g. `~/.azaion/onboard/c12-tooling.log`). diff --git a/_docs/02_document/components/13_c12_operator_tooling/tests.md b/_docs/02_document/components/13_c12_operator_tooling/tests.md new file mode 100644 index 0000000..0e7325a --- /dev/null +++ b/_docs/02_document/components/13_c12_operator_tooling/tests.md @@ -0,0 +1,143 @@ +# Test Specification — C12 Operator Pre-flight Tooling + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. C12 sequences the F1 (C11 download → C10 build) and F10 (C11 upload trigger) operator-side flows. 
+ +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-3.4 | Operator-relocalization on visual loss | FT-N-03, **C12-IT-01** | Covered | +| AC-8.3 | Imagery pre-loaded onto companion before flight | FT-P-15, FT-P-16, **C12-IT-02** | Covered | +| AC-8.4 | Mid-flight tile upload trigger (post-landing) | FT-P-17, **C12-IT-03** | Covered | +| AC-NEW-1 | Cold-start TTFF (cache must be in place pre-flight) | NFT-PERF-03, **C12-IT-02** | Covered (pre-flight side) | + +--- + +## Component-Internal Tests + +### C12-IT-01: operator re-localization workflow returns SUT to satellite-anchored + +**Summary**: when the SUT requests operator re-loc per AC-3.4, the operator workflow exposes the request, lets the operator confirm a candidate location, and returns the system to `satellite_anchored` within the configured budget. + +**Traces to**: AC-3.4 + +**Description**: scripted scenario — SUT publishes a re-loc request via FDR + GCS STATUSTEXT; operator-side C12 catches it; an automated test "operator" confirms the candidate; assert (a) the system transitions back to `satellite_anchored` within 30 s, (b) the confirmation event lands in FDR. + +**Input data**: scripted re-loc request fixture. + +**Expected result**: per assertion. + +**Max execution time**: 60 s. + +--- + +### C12-IT-02: build-cache orchestrates C11 download then C10 build + +**Summary**: `build_cache(area, sector_classification)` calls `C11.TileDownloader.fetch` first, then `C10.CacheProvisioner.build_artifacts`, and produces a `CacheBuildReport` wrapping both sub-reports. 
+ +**Traces to**: AC-8.3, AC-NEW-1 + +**Description**: with a fresh empty C6, call `build_cache` for a small Derkachi sub-area; assert (a) C11's `DownloadBatchReport` is in the result, (b) C10's `BuildReport` is in the result, (c) call ordering: C11 completes before C10 starts (verifiable via a test-side hook on each component), (d) on a download failure (mock 5xx), C10 is NOT invoked and the error surfaces in `CacheBuildReport`. + +**Input data**: small Derkachi sub-area + scripted fetch + a controllable mock layer. + +**Expected result**: per assertion. + +**Max execution time**: 5 min. + +--- + +### C12-IT-03: trigger_post_landing_upload invokes C11 TileUploader on confirmed ON_GROUND + +**Summary**: `trigger_post_landing_upload` reads the most recent `FlightStateSignal` from the post-flight FDR; if `ON_GROUND` is confirmed for ≥ a configurable safety threshold (default 30 s), it invokes `C11.TileUploader.upload_pending`. If `ON_GROUND` is not confirmed, it refuses and returns a clear error. + +**Traces to**: AC-8.4 + +**Description**: stage two flight FDR fixtures — one ending with confirmed ON_GROUND for 60 s, one ending with `IN_FLIGHT` (incomplete log). Call `trigger_post_landing_upload`; assert (a) first case invokes upload, (b) second case refuses with `FlightStateNotConfirmedError`. + +**Input data**: 2 scripted FDR fixtures. + +**Expected result**: per assertion. + +**Max execution time**: 60 s. + +--- + +### C12-IT-04: CacheBuildReport surfaces operator-actionable failures + +**Summary**: when C10 reports a `ManifestSignatureError` or C11 reports stale-tile rejections beyond a configurable threshold, `CacheBuildReport` flags the build as actionable and returns a non-zero exit code from the CLI. 
+ +**Traces to**: AC-8.2 + operator UX + +**Description**: simulate (a) C10 manifest signature failure, (b) C11 stale-rejection rate > 30%; assert in each case the CLI exits non-zero and the report includes a clear next-action ("re-run fetch with --since=…" or "rotate signing key"). + +**Input data**: scripted failure injection. + +**Expected result**: actionable failure messages + non-zero exit. + +**Max execution time**: 60 s. + +--- + +## Performance Tests + +### C12-PT-01: end-to-end operator workflow wall-clock + +**Traces to**: AC-NEW-1 (pre-flight side; post-landing side covered by C11-PT-02). + +**Load scenario**: full Derkachi area; cold C6. + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| `build_cache` wall-clock | ≤ 18 min on a developer laptop with NVIDIA GPU (10 min download + 8 min build with overlap) | > 35 min | + +--- + +## Security Tests + +### C12-ST-01: CLI rejects writes to airborne images + +**Summary**: the operator-tool CLI has no command path that writes into the airborne `production-binary` image (defends against operator-side mistakes that would defeat ADR-004). + +**Traces to**: ADR-004 R02 enforcement (C12 side) + +**Test procedure**: +1. Enumerate every CLI subcommand. +2. For each, assert the implementation does not import `c11_tilemanager` into a context that gets packaged into the airborne image (this is structural — verified at build time by the SBOM diff and at code-review time). +3. Run a runtime smoke test: invoke each subcommand with `--help`; assert no warning about C11 being loaded. + +**Pass criteria**: no command path crosses into the airborne package boundary. +**Fail criteria**: any cross-boundary import. + +--- + +## Acceptance Tests + +### C12-AT-01: full operator workflow on a fresh workstation + +**Summary**: an operator with a fresh workstation can install the operator-tooling tarball and complete download → build → simulated-flight → upload in one session. 
+ +**Traces to**: AC-8.3, AC-8.4 + +**Preconditions**: +- Fresh workstation with Docker installed. +- Operator-tooling tarball present. +- Real `satellite-provider` reachable (for download); e2e-test mock fixture (for upload) bundled in the tarball. + +**Steps**: see `C11-AT-01` (C12 is the orchestrator that AT-01 exercises). + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Operator-tooling tarball | CI build artifact | varies | +| FDR fixtures (ON_GROUND-confirmed and IN_FLIGHT) | scripted | <100 MB each | +| Small Derkachi sub-area for C12-IT-02 | scripted | <500 MB | + +**Setup**: extract operator-tooling tarball; bring up Docker compose. +**Teardown**: stop containers; clean per-test C6. +**Data isolation**: per-test workspace under `tests/tmp/c12/<test_id>/`. diff --git a/_docs/02_document/components/14_c13_fdr/description.md b/_docs/02_document/components/14_c13_fdr/description.md new file mode 100644 index 0000000..d868d28 --- /dev/null +++ b/_docs/02_document/components/14_c13_fdr/description.md @@ -0,0 +1,136 @@ +# C13 — Flight Data Recorder (FDR) + +## 1. High-Level Overview + +**Purpose**: persist a per-flight ≤ 64 GB record of every payload class onboard (estimates, IMU traces, emitted MAVLink, system health, mid-flight tiles, ≤0.1 Hz failed-tile thumbnails) without silently dropping data (AC-NEW-3). Exclude raw nav/AI-cam frames (AC-8.5; only the failed-tile thumbnail forensic exception is allowed). The FDR is the system's audit log: every safety-critical decision, every emitted frame, every signing key rotation, every spoof-promotion-block lands here. + +**Architectural Pattern**: single concrete `FileFdrWriter` behind a `FdrWriter` interface. Single writer thread fed by lock-free in-process queues from every component. On writer-thread overrun the oldest queued records are dropped, but **every drop is recorded as a logged rollover event** — never silently. 
+ +**Upstream dependencies**: every component publishes to C13 via in-process pub/sub (drop-oldest-with-rollover-log on overrun). + +**Downstream consumers**: +- Post-flight: operator workstation (read via C12 retrieval). +- Real-time: nothing — C13 is write-only at runtime. + +## 2. Internal Interfaces + +### Interface: `FdrWriter` + +| Method | Input | Output | Async | Error Types | +|--------|-------|--------|-------|-------------| +| `open_flight` | `FlightHeader` | `None` | No (called once at takeoff) | `FdrOpenError` | +| `write_record` | `FdrRecord` | `None` | No (lock-free enqueue) | `FdrQueueOverrunError` (logged but does not raise) | +| `close_flight` | `()` | `FlightFooter` | No (called once at landing) | — | +| `current_size_bytes` | `()` | `int` | No | — | +| `is_rolling` | `()` | `bool` | No | — | + +**Input/Output DTOs**: +``` +FlightHeader: + flight_id: uuid + flight_started_at: ISO 8601 + monotonic_ns + config_snapshot: JSON + signing_key_rotation_event: record + manifest_content_hashes: dict[Path, sha256] + +FdrRecord: see data_model.md (FdrRecord; tagged union over payload classes) + +FlightFooter: + flight_ended_at: ISO 8601 + monotonic_ns + records_written: int + records_dropped_overrun: int + bytes_written: int + rollover_count: int +``` + +## 3. External API Specification + +Not applicable. + +## 4. 
Data Access Patterns + +| Query | Frequency | Hot Path | Index Needed | +|-------|-----------|----------|--------------| +| `write_record` from every component | up to ~100 Hz aggregate | Yes | n/a | +| Post-flight read (operator retrieval) | once per flight | No | filesystem layout per `(flight_id, segment)` | + +### Caching Strategy + +| Data | Cache Type | TTL | Invalidation | +|------|-----------|-----|-------------| +| In-process queue from each producer | bounded ring (drop-oldest with rollover log) | flight lifetime | per-record write | +| Writer-thread buffer | sized for ≥1 s of typical write load | flight lifetime | flush on segment rollover | + +### Storage Estimates + +| Table/Collection | Est. Row Count (1yr) | Row Size | Total Size | Growth Rate | +|-----------------|---------------------|----------|------------|-------------| +| Per-flight record file (segmented, oldest-segment-dropped policy) | bounded by 64 GB per AC-NEW-3 | varies per payload class | ≤ 64 GB / flight | bounded by AC-NEW-3 | +| Per-flight tile snapshots (mid-flight tiles) | ~few hundred / flight | 50–200 KB each | up to ~50 MB / flight | bounded by F4 mid-flight gen | +| Per-flight failed-tile thumbnails (AC-8.5 forensic exception) | ≤ 0.1 Hz × 8 h = ≤ 2880 thumbnails / flight | small JPEG | <50 MB | bounded by ≤ 0.1 Hz cap | + +### Data Management + +**Seed data**: none. + +**Rollback**: per-segment file layout makes per-segment deletion safe. The writer never overwrites a closed segment; it only appends to the current open segment, then opens a new segment when the previous reaches a configurable size cap. + +## 5. Implementation Details + +**Algorithmic Complexity**: per-record cost is `O(record_size)` for serialisation + write. Aggregate throughput sized for the worst-case AC-NEW-3 cap. + +**State Management**: +- Owns the open per-flight segment file handle. +- Owns the writer thread and the in-process producer queues. 
+- Owns the rollover policy (oldest segment dropped first once the total reaches 64 GB). + +**Key Dependencies**: + +| Library | Version | Purpose | +|---------|---------|---------| +| orjson / msgpack | per project pin | Record serialisation (serialisation format to be chosen during the decompose phase) | +| atomicwrites | latest | Segment file rotation (atomic open of new segment + close of previous) | +| filelock | per project pin | Cross-process safety for the FDR root (guards against operator-tool reads overlapping companion writes; during flight only the companion accesses the root) | + +**Error Handling Strategy**: +- `FdrOpenError` at takeoff: refuse takeoff (per AC-NEW-3 every payload class must be present from t=0). +- `FdrQueueOverrunError`: per-producer drop-oldest, but the rollover event itself is ALWAYS logged (a separate "overrun" record in the FDR records the dropped count and producer-id). Never silent. +- Filesystem write failure mid-flight: log to stdout/stderr (since we can't log to FDR at this point) + STATUSTEXT to GCS; the system continues to emit external positions because losing the audit log doesn't compromise navigation, but the operator must be alerted. + +## 6. Extensions and Helpers + +| Helper | Purpose | Used By | +|--------|---------|---------| +| `RecordSchema` | versioned record schema for cross-version FDR compatibility | C13 only — this is internal | + +## 7. Caveats & Edge Cases + +**Known limitations**: +- 64 GB cap is per AC-NEW-3. If payload-class throughput grows beyond what the cap supports for an 8 h flight, the producers MUST throttle or accept oldest-dropped — the FDR will not silently exceed the cap. +- Failed-tile thumbnail forensic exception is the ONLY raw-imagery-adjacent persistence; AC-8.5 must be re-asserted if any new payload class is added. + +**Potential race conditions**: +- The writer thread is the single writer; producers enqueue lock-free. No filesystem contention from within the companion. Operator-tool reads happen post-landing only. 
+ +**Performance bottlenecks**: +- Writer-thread serialisation throughput must exceed peak producer throughput. NFT-LIM-02 (8 h synthetic AC-NEW-3) validates. + +## 8. Dependency Graph + +**Must be implemented after**: nothing internal — C13 is foundational along with C7. + +**Can be implemented in parallel with**: every other component. + +**Blocks**: every component (every component logs to C13). + +## 9. Logging Strategy + +| Log Level | When | Example | +|-----------|------|---------| +| ERROR | `FdrOpenError`, mid-flight filesystem write failure | `C13 segment write failure: errno=ENOSPC; STATUSTEXT to GCS` | +| WARN | queue overrun (any producer) | `C13 queue overrun: producer=c5_state; dropped_count=23` | +| INFO | open/close flight; segment rollover | `C13 flight opened: flight_id=…; segment=0` | +| DEBUG | per-write timing (only in dev tier) | `C13 record written: kind=estimate; bytes=412; took=0.1ms` | + +**Log format**: structured JSON to stdout/journald. +**Log storage**: stdout / journald — but not C13 itself for ERROR (we'd be writing to the broken thing). FDR records are the project-level "logs" for everything except C13's own operational status. diff --git a/_docs/02_document/components/14_c13_fdr/tests.md b/_docs/02_document/components/14_c13_fdr/tests.md new file mode 100644 index 0000000..c557526 --- /dev/null +++ b/_docs/02_document/components/14_c13_fdr/tests.md @@ -0,0 +1,166 @@ +# Test Specification — C13 Flight Data Recorder + +Component-scoped. Suite-level coverage in `_docs/02_document/tests/*.md`. 
+ +## Acceptance Criteria Traceability + +| AC ID | Acceptance Criterion (one-line) | Test IDs | Coverage | +|-------|---------------------------------|----------|----------| +| AC-1.4 | 95% covariance + source label (FDR record class) | FT-P-03, **C13-IT-01** | Covered | +| AC-4.5 (revised) | Internal smoothing past keyframes (FDR-only path) | FT-P-10, **C13-IT-02** | Covered | +| AC-8.5 | No raw nav/AI-cam frame retention except thumbnail log | FT-P-18, **C13-IT-03** | Covered | +| AC-NEW-3 | FDR ≤64 GB / flight, no silent drops | NFT-LIM-02, **C13-IT-04** | Covered | +| RESTRICT-UAV-4 | No raw-photo storage; tile cache + FDR only | FT-P-18, NFT-LIM-03 | Covered | + +--- + +## Component-Internal Tests + +### C13-IT-01: every payload class produces an FDR record + +**Summary**: every component documented as an FDR producer publishes records of the right `kind`; missing producer = test failure. + +**Traces to**: AC-1.4, AC-NEW-3 (every-payload-class-from-t=0 invariant) + +**Description**: spin up a minimal in-process test harness wiring all 14 components (mocked where heavy, real where light); drive 10 s of synthetic flight; assert the FDR contains records of every documented `kind` (estimate, vio_health, vpr_health, match_health, pose_estimate, source_label_change, fc_emit, fc_inbound, signing_key_event, spoof_promotion_block, mid_flight_tile, failed_tile_thumbnail, system_health, segment_rollover). + +**Input data**: synthetic 10 s flight. + +**Expected result**: every kind present with at least one record. + +**Max execution time**: 60 s. + +--- + +### C13-IT-02: smoothed past-keyframe records land in FDR (NOT in FC stream) + +**Summary**: when C5 publishes a smoothed past-keyframe revision, it lands in FDR but is NOT forwarded to C8's emission path. 
+ +**Traces to**: AC-4.5 (revised) + +**Description**: per C5-IT-04 — re-run that scenario; assert (a) FDR contains the smoothed-history record class, (b) the C8 emission stream contains the original (unshifted) value. + +**Input data**: shared with C5-IT-04. + +**Expected result**: per assertion. + +**Max execution time**: 60 s. + +--- + +### C13-IT-03: AC-8.5 forensic-thumbnail-only enforcement + +**Summary**: C13 refuses to write a raw nav-cam or AI-cam frame; the only allowed exception is the failed-tile thumbnail at ≤0.1 Hz cap. + +**Traces to**: AC-8.5 + +**Description**: attempt to write an `FdrRecord` of `kind = "raw_nav_frame"`; assert C13 raises `RawFrameWriteForbiddenError`. Attempt `kind = "failed_tile_thumbnail"` at 0.05 Hz; assert accepted. Attempt the same kind at 0.5 Hz (above cap); assert rate-limited (drop with cap log). + +**Input data**: scripted producer. + +**Expected result**: per assertion. + +**Max execution time**: 30 s. + +--- + +### C13-IT-04: 64 GB cap holds without silent drop + +**Summary**: under synthetic 8 h replay producing > 64 GB worth of records, C13 enforces the cap via oldest-segment-dropped + always logs the rollover event. + +**Traces to**: AC-NEW-3 + +**Description**: scripted producer that emits at peak rates known to cross 64 GB in 8 h; replay; assert (a) total disk usage stays ≤ 64 GB, (b) every segment-rollover-with-drop is recorded with producer-id + dropped count, (c) the FlightFooter shows non-zero `records_dropped_overrun` matching the rollover events. + +**Input data**: synthetic high-rate producer. + +**Expected result**: cap held; every drop visible. + +**Max execution time**: 8 h on a Tier-1 runner (NFT-LIM-02 budget). + +--- + +### C13-IT-05: queue overrun produces a structured drop record (never silent) + +**Summary**: when a producer overruns its in-process queue, C13 writes a structured "overrun" record naming the producer and the dropped count. 
+ +**Traces to**: AC-NEW-3 (no-silent-drop) + +**Description**: artificially throttle the writer thread; flood one producer's queue; assert (a) the first dropped record triggers an "overrun" record, (b) the overrun record includes producer-id + dropped count, (c) when throttling is removed, normal writes resume. + +**Input data**: scripted producer + writer-thread throttle harness. + +**Expected result**: overrun record present + accurate count. + +**Max execution time**: 60 s. + +--- + +### C13-IT-06: refuse takeoff if `open_flight` fails + +**Summary**: per AC-NEW-3, every payload class must be present from t=0 — if C13 cannot open the segment file, takeoff is aborted. + +**Traces to**: AC-NEW-3 + +**Description**: configure `flight_root` to a directory the process cannot write to; call `open_flight`; assert `FdrOpenError` is raised; assert the calling code (composition root) refuses to open the FC adapter. + +**Input data**: read-only `flight_root` directory. + +**Expected result**: takeoff aborts; error logged. + +**Max execution time**: 5 s. + +--- + +## Performance Tests + +### C13-PT-01: writer-thread throughput vs peak producer rate + +**Traces to**: AC-NEW-3 + +**Load scenario**: aggregated peak producer rate (~100 Hz combined records). + +**Expected results**: + +| Metric | Target | Failure Threshold | +|--------|--------|-------------------| +| Writer throughput | ≥ 200 Hz sustained | < 100 Hz | +| Per-record serialise + write p95 | ≤ 5 ms | > 20 ms | + +--- + +## Security Tests + +### C13-ST-01: FDR record cannot be silenced via config + +**Summary**: there is no config flag that disables the spoofing-promotion-block, signing-key-rotation, or rollover-drop record kinds — they are mandatory per AC-NEW-3 + ADR-008. + +**Traces to**: defensive (AC-NEW-3, ADR-008) + +**Test procedure**: +1. Search the config schema for any flag that could disable any of those record kinds. +2. Assert no such flag exists. +3. 
Inject events of each kind under every documented config preset; assert all land in FDR. + +**Pass criteria**: no disabling flag found; all events land. +**Fail criteria**: any disabling flag exists or any event suppressed. + +--- + +## Acceptance Tests + +Covered transitively via FT-P-03 / FT-P-10 / FT-P-18 / NFT-LIM-02. + +--- + +## Test Data Management + +| Data Set | Source | Size | +|----------|--------|------| +| Synthetic 10 s flight harness | scripted | <10 MB | +| 8 h synthetic high-rate producer | scripted | runtime-generated | +| Read-only `flight_root` fixture | scripted | n/a (fs perms) | + +**Setup**: per-test `flight_root` under `tests/tmp/c13/<test_id>/`. +**Teardown**: drop tmp directory. +**Data isolation**: per-test `flight_root`. diff --git a/_docs/02_document/data_model.md b/_docs/02_document/data_model.md new file mode 100644 index 0000000..6e86a25 --- /dev/null +++ b/_docs/02_document/data_model.md @@ -0,0 +1,571 @@ +# Data Model + +> Date: 2026-05-09 (Plan Phase 2b — initial draft). +> Inputs: `_docs/02_document/architecture.md` § 4 (Data Model Overview), § 5, § 7; `_docs/02_document/glossary.md`; `_docs/00_problem/{acceptance_criteria,restrictions}.md`; `_docs/01_solution/solution.md`; parent-suite `satellite-provider` schema (mirrored target for byte-identical post-landing upload). +> Scope: persistent and on-disk artifacts produced or consumed by the onboard companion, the operator-side pre-flight tooling (C12), the operator-side Tile Manager (C11 — pre-flight tile download + post-landing tile upload), and the FDR (C13). In-flight runtime DTOs (per-frame estimates, IMU windows, etc.) are summarised at the end of this document but are NOT persistent and NOT in the migration scope. + +--- + +## 1.
Overview + +The onboard system is **mostly streaming / in-memory**: the per-frame pipeline (`NavCameraFrame → C1 + C2 → C2.5 → C3/C3.5 → C4 → C5 → C8`) operates entirely on in-process Python DTOs and never persists those frames or their derived per-frame estimates as relational rows. What IS persistent splits into four backing stores; the data model below is the union of all four. + +| Store | Technology | What lives here | Persistence scope | +|---|---|---|---| +| **Relational** | PostgreSQL 16 (mirror of `satellite-provider`'s schema; one DB per companion) | `tiles` (mirrored), `flights`, `sector_classifications`, `manifests`, `engine_cache_entries` | Across flights; survives reboot | +| **Filesystem (tile cache)** | Local NVM (`./tiles/{zoomLevel}/{x}/{y}.jpg` + `./tiles/{zoomLevel}/{x}/{y}.json` sidecar) | Tile JPEG bodies; per-tile metadata sidecar | Across flights; survives reboot | +| **Filesystem (artifacts)** | Local NVM | FAISS HNSW `.index`; TRT engines + INT8 calibration caches; camera calibration JSON; manifest JSON | Across flights; rebuildable from the relational tables + `satellite-provider` | +| **Filesystem (FDR)** | Local NVM, ≤ 64 GB ring per flight | `FdrRecord` segments; AC-8.5 ≤ 0.1 Hz failed-tile thumbnails | Per flight; rolling, oldest-segment-dropped on overrun | + +**Architecture-driven principles** (carried verbatim from `architecture.md` Principle list): + +1. **The persistent tile schema MUST stay byte-compatible with `satellite-provider`'s** so post-landing upload (D-PROJ-2) is a copy, not a transformation. Onboard-only fields are added as **additive columns**; the canonical Google-Maps-sourced columns must keep their semantics and types. +2. **No raw-frame storage** (AC-8.5). The only image persistence path is the orthorectified `Tile` (mid-flight or pre-flight). The single forensic exception is the AC-8.5 ≤ 0.1 Hz failed-tile thumbnail log inside the FDR. +3. 
**The camera calibration artifact is the ONLY way camera-specific math enters the system** (Principle #1). Test fixtures (`adti26.json`) and production deployments (`adti20.json`) load different artifacts on the same code path; the schema MUST treat `adti20` vs `adti26` as data, not as a code branch. +4. **No persistent secrets on the airborne companion image.** The MAVLink-2.0 signing key and the per-flight onboard tile-signing key are generated at takeoff load, logged to the FDR, and discarded with the FDR ring. No long-lived secrets table. +5. **Append-only / additive-only by default.** Migrations may add tables, columns, indexes, and CHECK constraints, but MUST NOT rename or drop existing columns without an ADR-recorded deprecation window. The `tiles` schema specifically is **frozen on the canonical columns** and only extensible via additive onboard-only columns. +6. **Honest covariance is a schema concern, not a tuning knob.** Every persisted estimate-derived structure (`TileQualityMetadata`, FDR-emitted-position records) carries the same 2×2 horizontal sub-matrix that drives AC-NEW-4 / AC-NEW-7. Lossy storage of covariance is a defect. + +**What is NOT in scope for this document**: + +- Per-frame in-memory DTOs (`VioOutput`, `VprQuery`, `VprResult`, `RerankResult`, `MatchResult`, `PoseEstimate`, `EmittedExternalPosition`) — they are listed as "non-persistent runtime entities" in § 5 below for completeness only; their schema lives in component specs (Step 3). +- The `satellite-provider`-side voting / trust schema (D-PROJ-2 design task #2). The onboard side writes `voting_status = pending` and reads only `voting_status = trusted` (or operator-overridden `pending`); the actual promotion logic is parent-suite work. +- The operator-workstation pre-flight tooling UI state (C12) — that is operator-side code with its own persistence story, out of the onboard companion's data scope. + +--- + +## 2. 
Persistent Entities + +### 2.1 `tiles` (PostgreSQL — MIRRORS `satellite-provider`; additive onboard columns) + +The tile is the single most important persistent entity. The schema deliberately starts from `satellite-provider`'s existing PostgreSQL `tiles` table (Google-Maps-sourced) and extends it with onboard-only columns. The **canonical columns** are read-and-writeable from both sides; the **onboard-only columns** are NULL on `satellite-provider`-sourced rows and populated only on companion-orthorectified rows. + +**Storage split**: row in PostgreSQL ↔ JPEG body on filesystem at `./tiles/{zoomLevel}/{x}/{y}.jpg`. The JPEG is byte-identical to what `satellite-provider` produces / consumes; the sidecar `./tiles/{zoomLevel}/{x}/{y}.json` carries the same row content as a JSON dump for the Tile Manager's batched HTTP form-data payload (D-PROJ-2 contract). + +#### 2.1.1 Columns + +| Column | Type | Origin | Constraints | Description | +|---|---|---|---|---| +| `id` | `bigserial` PK | both | NOT NULL | Surrogate primary key (consistent with `satellite-provider`). | +| `zoom_level` | `int` | both | NOT NULL, CHECK (`zoom_level` BETWEEN 10 AND 22) | Slippy/XYZ zoom; same semantics as `satellite-provider`. | +| `tile_x` | `int` | both | NOT NULL | Slippy/XYZ X index. | +| `tile_y` | `int` | both | NOT NULL | Slippy/XYZ Y index. | +| `latitude` | `double precision` | both | NOT NULL | Tile center latitude (WGS84). | +| `longitude` | `double precision` | both | NOT NULL | Tile center longitude (WGS84). | +| `tile_size_meters` | `double precision` | both | NOT NULL, CHECK (`tile_size_meters` > 0) | Ground footprint side length in metres (lat-adjusted m/px × pixels). | +| `tile_size_pixels` | `int` | both | NOT NULL, CHECK (`tile_size_pixels` > 0) | Pixel side length (square tiles). | +| `capture_timestamp` | `timestamptz` | both | NOT NULL | When the tile imagery was captured (Google Maps ingest time for `googlemaps`; UAV nav-camera frame timestamp for `onboard_ingest`). 
| +| `compression` | `text` | both | NOT NULL, default `'jpeg'` | Currently `jpeg` only; reserved for future formats. | +| `crs` | `text` | both | NOT NULL, default `'EPSG:3857'` | Spherical Mercator; same as `satellite-provider`. | +| `source` | `text` | both | NOT NULL, CHECK (`source` IN (`'googlemaps'`, `'onboard_ingest'`)) | New CHECK domain; existing `satellite-provider` rows pre-migration are backfilled `'googlemaps'`. | +| `flight_id` | `uuid` | onboard-only | NULL for `googlemaps`; NOT NULL for `onboard_ingest`; FK → `flights.id` | Which flight produced the tile (D-PROJ-2 design task #1 contract). | +| `companion_id` | `text` | onboard-only | NULL for `googlemaps`; NOT NULL for `onboard_ingest` | Deployed unit identifier; threat-model boundary marker. | +| `quality_metadata` | `jsonb` | onboard-only | NULL for `googlemaps`; NOT NULL for `onboard_ingest`; schema § 2.1.3 | Sufficient inputs for parent-suite voting (D-PROJ-2 #2). | +| `voting_status` | `text` | onboard-only | NULL for `googlemaps`; NOT NULL for `onboard_ingest`; CHECK (`voting_status` IN (`'pending'`, `'trusted'`, `'rejected'`)); default `'pending'` | Cache-poisoning safety budget gate (AC-NEW-7). The companion never writes `'trusted'`; promotion is parent-suite-side only. | +| `freshness_status` | `text` | both | NOT NULL, CHECK (`freshness_status` IN (`'fresh'`, `'stale_warn'`, `'stale_reject'`)); default `'fresh'` | Computed at ingest from `capture_timestamp` vs. the area's `SectorClassification` (active-conflict 6 mo / stable rear 12 mo); demoted on age regardless of `voting_status`. | +| `signature` | `bytea` | onboard-only | NULL for `googlemaps`; NOT NULL for `onboard_ingest` | Per-flight onboard signing key signature over the upload payload (D-PROJ-2 § Design task #1). | +| `created_at` | `timestamptz` | both | NOT NULL, default `now()` | Row creation time on the companion (or on `satellite-provider` for `googlemaps` rows). 
| +| `updated_at` | `timestamptz` | both | NOT NULL, default `now()` | Updated by trigger on `voting_status` / `freshness_status` change. | + +**Composite uniqueness**: `UNIQUE (zoom_level, tile_x, tile_y, source, flight_id)` — `(zoom_level, tile_x, tile_y)` is unique per `(source, flight_id)` tuple, so the same area can have one `googlemaps` row plus N `onboard_ingest` rows from different flights without collision. Because `flight_id` is stored as NULL for `googlemaps` rows (see the column table) and PostgreSQL treats NULLs as distinct in a UNIQUE constraint, the uniqueness index treats a NULL `flight_id` as the empty (nil) UUID `'00000000-0000-0000-0000-000000000000'` for `googlemaps` rows (e.g. via `COALESCE(flight_id, '00000000-0000-0000-0000-000000000000')` in a unique expression index) so the composite uniqueness still holds; the stored column stays NULL, so the `flights.id` foreign key is not violated. + +**Indexes**: + +- B-tree on `(zoom_level, tile_x, tile_y)` — primary spatial lookup path for VPR retrieval and pre-flight cache hydration. +- B-tree on `(latitude, longitude)` — bounding-box queries for sector classification and spatial-coverage reports. +- B-tree on `voting_status` partial WHERE `source = 'onboard_ingest'` — operator-tooling queries for "which mid-flight tiles are still pending promotion?". +- B-tree on `flight_id` — FDR cross-reference; post-landing upload batching. +- B-tree on `created_at` — pruning / rollover queries. + +**Filesystem invariant**: for every row with `source = 'onboard_ingest'`, the JPEG body MUST exist at `./tiles/{zoom_level}/{tile_x}/{tile_y}.jpg` and a sidecar `./tiles/{zoom_level}/{tile_x}/{tile_y}.json` MUST exist with the row's full JSON form. The Tile Manager's `TileUploader` (C11) reads both and the row may be deleted from the local PostgreSQL only after `satellite-provider` returns 2xx for that `(flight_id, zoom_level, tile_x, tile_y)` tuple.
+ +#### 2.1.2 Lifecycle + +| Trigger | Action on `tiles` row | Action on filesystem | Action on FDR | +|---|---|---|---| +| Pre-flight cache hydration (C11 `TileDownloader` download from `satellite-provider`) | INSERT `source='googlemaps'`, `voting_status=NULL`, `freshness_status` computed | WRITE `./tiles/{z}/{x}/{y}.jpg` (atomic) | n/a (pre-flight) | +| Mid-flight orthorectify-and-store (in-air) | INSERT `source='onboard_ingest'`, `voting_status='pending'`, `quality_metadata` populated | WRITE `./tiles/{z}/{x}/{y}.jpg` (atomic, dedup by composite key) + sidecar JSON | EVENT `tile_emitted` with row PK | +| Freshness re-evaluation (background, periodic per AC-8.2 / AC-NEW-6) | UPDATE `freshness_status` | n/a | EVENT `freshness_demoted` if changed | +| Post-landing upload success (C11 `TileUploader`, op-workstation) | DELETE local row after `satellite-provider` 2xx | DELETE local JPEG + sidecar | EVENT `tile_uploaded` with `(flight_id, z, x, y)` and remote ack ID | +| Post-landing upload failure (retryable) | UPDATE `updated_at`; row retained for retry | retain JPEG + sidecar | EVENT `tile_upload_failed` with reason | + +**No DELETE in flight**: rows produced in flight (`source='onboard_ingest'`) MUST NOT be deleted while `flight_state == IN_AIR`. Defense-in-depth on Principle #4 (process-level isolation already prevents the upload code path from being loaded). + +#### 2.1.3 `quality_metadata` jsonb sub-schema + +For `source = 'onboard_ingest'` rows, `quality_metadata` carries the full input the parent-suite voting layer (D-PROJ-2 design task #2) needs without re-derivation. Schema is fixed; new keys are additive only. 
+ +```jsonc +{ + "estimator_label": "satellite_anchored" | "visual_propagated" | "dead_reckoned", + "covariance_2x2": [[<float>, <float>], [<float>, <float>]], + "last_anchor_age_ms": <int >= 0>, + "mre_px": <float >= 0>, // reprojection error at the contributing match + "imu_bias_norm": <float >= 0>, // VIO health proxy + "vio_strategy": "okvis2" | "vins_mono" | "klt_ransac", + "vpr_strategy": "ultra_vpr" | "mega_loc" | "mix_vpr" | "sela_vpr" | "eigen_places" | "net_vlad" | "salad", + "matcher": "disk_lightglue" | "aliked_lightglue" | "xfeat", + "adhop_invoked": <bool>, + "thermal_throttle_active": <bool>, // D-CROSS-LATENCY-1 hybrid was active at emit time + "build_kind": "deployment" | "research" // ADR-002 disambiguation; voting layer may down-weight research-binary tiles +} +``` + +The `tiles_quality_metadata_schema_v1` jsonschema lives at `_docs/02_document/schemas/tiles_quality_metadata_v1.json` and is referenced by both the local INSERT path and the C11 `TileUploader` payload validator. Schema versioning: bumping any field requires a new `*_schema_v2.json` and the row stamps the schema version into the jsonb under `_v` (additive-only). + +--- + +### 2.2 `flights` (PostgreSQL — onboard-only) + +A lightweight tracking row per flight, used by the FDR's manifest, the Tile Manager `TileUploader` batching, and the cache-poisoning safety audit (AC-NEW-7). + +| Column | Type | Constraints | Description | +|---|---|---|---| +| `id` | `uuid` PK | NOT NULL | Generated at takeoff load. | +| `companion_id` | `text` | NOT NULL | Same as `tiles.companion_id`. | +| `started_at` | `timestamptz` | NOT NULL, default `now()` | First MAVLink HEARTBEAT received post-takeoff. | +| `ended_at` | `timestamptz` | NULL until landing | Set on `flight_state` transition `IN_AIR → ON_GROUND`. | +| `signing_key_fingerprint` | `bytea` | NOT NULL | SHA-256 of the per-flight onboard signing pubkey; private key is never persisted.
| +| `mavlink_signing_key_fingerprint` | `bytea` | NULL on iNav-only flights | SHA-256 of the per-flight MAVLink-2.0 signing key (D-C8-9 = (d), AP path only). | +| `fc_profile` | `text` | NOT NULL, CHECK (`fc_profile` IN (`'ardupilot_plane'`, `'inav'`)) | Which FC profile this flight ran. | +| `vio_strategy` | `text` | NOT NULL | Active C1 strategy this flight (ADR-001 — locked at startup). | +| `vpr_strategy` | `text` | NOT NULL | Active C2 strategy. | +| `build_kind` | `text` | NOT NULL, CHECK (`build_kind` IN (`'deployment'`, `'research'`)) | Disambiguates IT-12 research-binary flights from production flights. | +| `manifest_id` | `uuid` | NOT NULL, FK → `manifests.id` | Pre-flight manifest used for this flight (ties to model + calibration + corpus + sector). | +| `fdr_path` | `text` | NOT NULL | Local path to the FDR ring directory for this flight. | +| `created_at` | `timestamptz` | NOT NULL, default `now()` | | + +**Lifecycle**: row INSERTed at takeoff load; UPDATE on landing; never UPDATEd in flight after the takeoff-load row has been written. Row is never DELETED on the companion (the FDR retention policy decides physical removal). + +--- + +### 2.3 `sector_classifications` (PostgreSQL — operator-set, onboard-side cache) + +Mirrors operator-tooling C12's authoritative sector classification onto the companion so the freshness gate (AC-8.2 / AC-NEW-6) can be evaluated locally without a network call. + +| Column | Type | Constraints | Description | +|---|---|---|---| +| `id` | `bigserial` PK | NOT NULL | | +| `polygon_geojson` | `jsonb` | NOT NULL | GeoJSON polygon (WGS84) of the classified area. | +| `classification` | `text` | NOT NULL, CHECK (`classification` IN (`'active_conflict'`, `'stable_rear'`)) | Drives the freshness threshold. | +| `freshness_threshold_days` | `int` | NOT NULL, CHECK (`freshness_threshold_days` IN (180, 365)) | Materialized: 180 for `active_conflict`, 365 for `stable_rear`. 
| +| `set_by` | `text` | NOT NULL | Operator identifier from C12. | +| `set_at` | `timestamptz` | NOT NULL, default `now()` | | +| `revoked_at` | `timestamptz` | NULL until revoked | Only set when a polygon's classification is changed; the new row supersedes; the old row is retained for audit. | +| `manifest_id` | `uuid` | NOT NULL, FK → `manifests.id` | Which pre-flight manifest baked this classification. | + +**Indexes**: GiST on `polygon_geojson` for spatial containment queries; B-tree on `manifest_id`. + +**Note**: this table is the onboard cache of operator authority. The companion never *originates* a classification; it only consumes the rows the operator-workstation tooling (C12) staged into the manifest. A row mismatch between the manifest's expected classifications and the table is a content-hash gate failure (D-C10-3) → companion refuses takeoff. + +--- + +### 2.4 `manifests` (PostgreSQL — pre-flight idempotence gate) + +Implements D-C10-1 (pre-flight idempotence): the same input bundle (model + calibration + tile corpus + sector classifications) must produce a stable manifest hash. The manifest is the takeoff-load content-hash root; D-C10-3 (atomic write + content-hash gate) verifies it. + +| Column | Type | Constraints | Description | +|---|---|---|---| +| `id` | `uuid` PK | NOT NULL | | +| `manifest_hash` | `bytea` | NOT NULL, UNIQUE | SHA-256 over the canonicalized manifest contents. | +| `model_bundle_hash` | `bytea` | NOT NULL | SHA-256 over the (VIO + VPR + matcher + AdHoP) model bundle. | +| `calibration_artifact_hash` | `bytea` | NOT NULL | SHA-256 over the loaded camera calibration JSON. | +| `corpus_hash` | `bytea` | NOT NULL | SHA-256 over the sorted list of `(zoom_level, tile_x, tile_y, content_hash)` for `source='googlemaps'` rows in the manifest's spatial scope. | +| `sector_classifications_hash` | `bytea` | NOT NULL | SHA-256 over the sorted list of `sector_classifications` rows. 
| +| `engine_cache_bundle_hash` | `bytea` | NOT NULL | SHA-256 over the linked TRT engines + INT8 calibration caches. | +| `descriptor_index_hash` | `bytea` | NOT NULL | SHA-256 over the FAISS `.index` file. | +| `created_at` | `timestamptz` | NOT NULL, default `now()` | | +| `staged_by` | `text` | NOT NULL | Operator workstation identifier. | +| `verified_at` | `timestamptz` | NULL until verified | Set by C10 at takeoff load when the content-hash gate passes. | + +**Indexes**: UNIQUE on `manifest_hash`; B-tree on `created_at`. + +--- + +### 2.5 `engine_cache_entries` (PostgreSQL — TRT engine catalogue) + +Implements D-C10-7: TRT engines are keyed by the (SM, JetPack, TRT, precision) tuple so a deployed image with the wrong tuple cannot accidentally consume an engine compiled for a different SM. + +| Column | Type | Constraints | Description | +|---|---|---|---| +| `id` | `bigserial` PK | NOT NULL | | +| `model_name` | `text` | NOT NULL | Logical name (`disk`, `lightglue`, `ultra_vpr`, `aliked`, `xfeat`, …). | +| `model_revision` | `text` | NOT NULL | Upstream commit / weight revision pin. | +| `sm` | `text` | NOT NULL | Jetson SM (e.g., `87` for Orin). | +| `jetpack_version` | `text` | NOT NULL | E.g., `6.2`. | +| `tensorrt_version` | `text` | NOT NULL | E.g., `10.3`. | +| `precision` | `text` | NOT NULL, CHECK (`precision` IN (`'fp32'`, `'fp16'`, `'int8'`, `'int8_fp16_mixed'`)) | | +| `engine_path` | `text` | NOT NULL | Filesystem path to the `.engine` binary. | +| `engine_size_bytes` | `bigint` | NOT NULL | | +| `engine_hash` | `bytea` | NOT NULL | SHA-256 over the engine file. | +| `int8_calibration_path` | `text` | NULL for non-INT8 | Filesystem path to the INT8 calibration cache. | +| `int8_calibration_hash` | `bytea` | NULL for non-INT8 | SHA-256 over the calibration cache. | +| `created_at` | `timestamptz` | NOT NULL, default `now()` | | + +**Composite uniqueness**: `UNIQUE (model_name, model_revision, sm, jetpack_version, tensorrt_version, precision)`. 
+ +**Filesystem invariant**: `engine_path` and `int8_calibration_path` (when set) MUST be valid local files; their on-disk SHA-256 MUST match `engine_hash` / `int8_calibration_hash`. C7 verifies the hash on engine load; mismatch ⇒ takeoff refused. + +--- + +### 2.6 Camera calibration artifact (filesystem JSON, NOT in PostgreSQL) + +Per Principle #1, camera-specific math enters only via this file. It is a single JSON document loaded once at startup. It is **not** stored in PostgreSQL because it is treated as an *image* of the camera (test fixture vs. production unit) rather than a queryable record — different binaries / different fixtures point at different files via config, and the schema is the same on every code path. + +**Path convention**: + +- Production: `/etc/gps-denied/calibration/adti20.json` (per-deployed-unit, post D-PROJ-1 hybrid) +- Test fixtures: `tests/fixtures/calibration/adti26.json` + +**Schema** (camera_calibration_v1): + +```jsonc +{ + "schema_version": 1, + "camera_name": "adti20" | "adti26" | <string>, + "model": "ADTi 20MP 20L V1" | <string>, + "sensor": { + "width_px": 5472, + "height_px": 3648, + "pixel_size_um": 3.76, + "sensor_size_mm": [23.6, 15.7] + }, + "intrinsics": { + "fx_px": <float>, "fy_px": <float>, + "cx_px": <float>, "cy_px": <float> + }, + "distortion": { + "model": "opencv_radtan" | "opencv_kannala_brandt", + "coefficients": [<float>, ...] + }, + "body_to_camera": { + "rotation_quaternion_xyzw": [<float>, <float>, <float>, <float>], + "translation_m": [<float>, <float>, <float>] + }, + "acquisition_method": "factory_sheet" | "checkerboard_refined" | "hybrid", + "acquisition_timestamp": <timestamp>, + "acquisition_notes": <string>, + "content_hash": <SHA-256 hex string> +} +``` + +The artifact's `content_hash` is the value referenced by `manifests.calibration_artifact_hash`. + +--- + +### 2.7 FAISS HNSW descriptor index (filesystem binary, NOT in PostgreSQL) + +VPR descriptors are stored in a FAISS-native `.index` file. This is the on-disk form of `VprQuery` candidates' lookup index, not a queryable relational entity.
Implementation details: + +- Path: `/var/lib/gps-denied/descriptors/{vpr_strategy}_{model_revision}.index` +- Atomic write: per D-C10-3 — write to `*.index.tmp`, fsync, rename via `atomicwrites`. +- Content-hash gate: SHA-256 over the file body, recorded in `manifests.descriptor_index_hash`. +- Companion of the `.index` is a `*.meta.json` carrying `{tile_id → descriptor_offset}` for the rows under the manifest's spatial scope. The mapping uses `tiles.id` PKs, so a tile pruned from the relational table invalidates the offset and forces a re-build of the index. + +The descriptor index is **rebuildable** from `(tiles, model_bundle)` + a deterministic VPR feature-extraction pass; therefore it is not part of any backup / DR target — losing the `.index` re-runs C10 descriptor generation on the operator workstation pre-flight. + +--- + +### 2.8 Flight Data Recorder (`FdrRecord`, filesystem ring, NOT in PostgreSQL) + +The FDR is a per-flight ≤ 64 GB local ring buffer of structured records. It is **append-only** while the flight is `IN_AIR`; rollover (oldest segment dropped first) is logged but never silent (AC-NEW-3). + +**Layout** (per flight): + +``` +/var/lib/gps-denied/fdr/{flight_id}/ + manifest.json # flight metadata snapshot (mirror of flights row + manifests row) + segments/ + seg_00001.bin # rolling segments (~256 MB each) + seg_00002.bin + ... 
+ thumbnails/ # AC-8.5 ≤ 0.1 Hz failed-tile thumbnail log (forensic exception) + YYYYMMDD-HHMMSS-.jpg + rollover.log # one line per dropped segment +``` + +**Record schema (`FdrRecord`, length-prefixed binary stream)**: + +``` +record_header (16 bytes): + magic u32 = 0x47464452 # "GFDR" + version u16 = 1 + type u16 # see record types below + monotonic_ms u64 # since flight start +record_body: variable, schema per `type` +record_crc32 u32 +``` + +**Record types (`type` field — additive-only)**: + +| Type ID | Name | Body | +|---|---|---| +| `0x0001` | `EmittedExternalPosition` | WGS84 + 6×6 covariance + provenance label + `last_satellite_anchor_age_ms` + per-FC encoding (AP `GPS_INPUT` or iNav `MSP2_SENSOR_GPS`) | +| `0x0002` | `ImuTrace` | timestamped accel + gyro window | +| `0x0003` | `ReceivedMavlinkRaw` | raw MAVLink frame (signed/unsigned), captured `tlog`-style | +| `0x0004` | `ReceivedMsp2Raw` | raw MSP2 frame (iNav profile only) | +| `0x0005` | `SystemHealth` | CPU%, GPU%, temp, throttle flag, RAM, VRAM, NVM remaining | +| `0x0006` | `SourceLabelTransition` | `{satellite_anchored, visual_propagated, dead_reckoned}` transition | +| `0x0007` | `MidFlightTileEmitted` | reference to `tiles.id` + `quality_metadata` snapshot | +| `0x0008` | `MidFlightTileFailed` | reason + thumbnail filename (AC-8.5 forensic exception) | +| `0x0009` | `MavlinkSigningKeyRotated` | new key fingerprint + reason (D-C8-9) | +| `0x000A` | `EkfSourceSetCommand` | D-C8-2 source-set switch event | +| `0x000B` | `VisualBlackoutEvent` | start / end + reason (AC-3.5, AC-NEW-8) | +| `0x000C` | `SpoofingPromotionEvent` | promoted / rejected + 10 s-gate state (AC-NEW-2, AC-NEW-8) | +| `0x000D` | `ContentHashGateFail` | takeoff-load gate fail (D-C10-3) | +| `0x000E` | `ThermalThrottleHybridSwitch` | K=3 ↔ K=2 switch event (D-CROSS-LATENCY-1, ADR-006) | +| `0x000F` | `ComponentLifecycleEvent` | per-component start / stop / fail | + +**Backward compatibility**: new record types are appended; 
readers MUST skip records they don't recognise (the per-record length prefix of the length-prefixed stream is enough to advance the cursor; the fixed 16-byte `record_header` itself carries no body-length field). No record type is ever renumbered or removed; deprecation is by ceasing to emit. + +**Retention**: per-flight ring; on `IN_AIR → ON_GROUND` transition, the ring is sealed and the operator-tooling FDR-retrieval workflow (C12) copies it off the companion. The companion auto-prunes flights older than the configured retention window (default: 30 days) — the prune log itself is its own FDR record on the next flight. + +--- + +### 2.9 Tile JPEG bodies (filesystem) + +JPEG bodies live at `./tiles/{zoomLevel}/{x}/{y}.jpg`. A sidecar `./tiles/{zoomLevel}/{x}/{y}.json` carries the full row content for upload-time payload assembly. Both files are atomic-written (via `atomicwrites`); both are removed only after the corresponding `tiles` row's lifecycle says it is safe (see § 2.1.2). Filesystem and PostgreSQL drift is treated as a defect: the operator-tooling C12 has a periodic `consistency_audit` that reports any orphan files / missing files. + +--- + +## 3.
Mermaid ERD + +```mermaid +erDiagram + flights ||--o{ tiles : "1 flight produces N onboard tiles" + flights ||--|| manifests : "1 flight references 1 manifest" + manifests ||--o{ sector_classifications : "manifest captures N classifications" + manifests ||--o{ engine_cache_entries : "manifest pins N engines" + sector_classifications ||--o{ tiles : "polygon contains N tiles (spatial)" + + flights { + uuid id PK + text companion_id + timestamptz started_at + timestamptz ended_at + bytea signing_key_fingerprint + bytea mavlink_signing_key_fingerprint + text fc_profile + text vio_strategy + text vpr_strategy + text build_kind + uuid manifest_id FK + text fdr_path + timestamptz created_at + } + + tiles { + bigserial id PK + int zoom_level + int tile_x + int tile_y + double latitude + double longitude + double tile_size_meters + int tile_size_pixels + timestamptz capture_timestamp + text compression + text crs + text source + uuid flight_id FK + text companion_id + jsonb quality_metadata + text voting_status + text freshness_status + bytea signature + timestamptz created_at + timestamptz updated_at + } + + sector_classifications { + bigserial id PK + jsonb polygon_geojson + text classification + int freshness_threshold_days + text set_by + timestamptz set_at + timestamptz revoked_at + uuid manifest_id FK + } + + manifests { + uuid id PK + bytea manifest_hash + bytea model_bundle_hash + bytea calibration_artifact_hash + bytea corpus_hash + bytea sector_classifications_hash + bytea engine_cache_bundle_hash + bytea descriptor_index_hash + timestamptz created_at + text staged_by + timestamptz verified_at + } + + engine_cache_entries { + bigserial id PK + text model_name + text model_revision + text sm + text jetpack_version + text tensorrt_version + text precision + text engine_path + bigint engine_size_bytes + bytea engine_hash + text int8_calibration_path + bytea int8_calibration_hash + timestamptz created_at + } +``` + +> Filesystem-only artifacts (camera calibration 
JSON, FAISS `.index`, FDR records, tile JPEG bodies) are NOT shown in the ERD. Their referential ties to PostgreSQL are: `manifests.calibration_artifact_hash` ↔ calibration JSON; `manifests.descriptor_index_hash` ↔ FAISS `.index`; `flights.fdr_path` ↔ FDR ring directory; `tiles.id` × `(zoom_level, tile_x, tile_y)` ↔ tile JPEG body + sidecar. + +--- + +## 4. Migration Strategy + +### 4.1 Versioning tool + +**Alembic** (SQLAlchemy-based migrations) is the chosen tool. Justification: + +- Python is the host language (per `architecture.md` § 2). Alembic ships with the SQLAlchemy ecosystem the project is already using for any DB-touching code. +- Alembic supports both autogenerate and hand-written migrations; the team will use **hand-written migrations only** (autogenerate is acceptable as a starting point for a migration but must be human-reviewed before commit — autogenerate cannot infer the additive-only / freeze-canonical rules). +- Alembic's `downgrade()` direction satisfies the reversibility requirement below for additive migrations; non-reversible migrations (rare; only for the controlled-deprecation cases) are explicitly marked. +- Alembic plays well with the per-companion local PostgreSQL deployment model — no shared upstream migration coordinator is needed. + +**Coordination with `satellite-provider`**: the parent suite owns the canonical `satellite-provider` schema (whatever migration tool — likely EF Core — they use is irrelevant to the onboard side). The onboard project's `tiles` migration baseline MUST be cross-checked against the latest `satellite-provider` `tiles` schema at every onboard cycle; the onboard-only columns are added on top. A Plan-phase carryforward (Step 4 risk register) captures the ongoing coordination obligation. + +### 4.2 Reversibility requirement + +**Default = reversible.** Every migration MUST implement a working `downgrade()` that returns the schema to the prior state without data loss. 
Migrations that cannot be reversed (e.g., DROP COLUMN) are **forbidden by default** and require an ADR + user sign-off as part of the migration commit. + +Concretely: + +- `op.add_column(...)` ⇄ `op.drop_column(...)` — reversible. +- `op.create_table(...)` ⇄ `op.drop_table(...)` — reversible (data loss is intentional for new tables). +- `op.create_index(...)` ⇄ `op.drop_index(...)` — reversible. +- `op.alter_column(...)` for type changes — only reversible if the type widening is truly bidirectional; otherwise this is a forbidden-by-default migration that requires ADR. +- Backfills / data migrations — must be implemented as either (a) idempotent (safe to re-run in either direction) or (b) split into a separate "data-only" migration whose reverse is a no-op with a documented data-loss note. + +### 4.3 Naming convention + +``` +migrations/versions/YYYY_MM_DD_HHMM_<slug>.py +``` + +Where: + +- `YYYY_MM_DD_HHMM` is the migration's authored timestamp in UTC (matches Alembic's natural ordering and is operator-friendly). +- `<slug>` is snake_case (lowercase words joined by underscores) describing the change in ≤ 6 words: `add_voting_status_to_tiles`, `create_engine_cache_entries`, `freshness_threshold_check`, etc. +- The Alembic `revision` and `down_revision` IDs are 12-character random hex (Alembic default). + +`migrations/env.py` uses the `gps_denied.db.metadata` SQLAlchemy `MetaData` object for autogenerate diffs; the connection URL is read from the same config file the runtime uses (no separate migration config). + +### 4.4 Migration baseline + +The first migration `0001_initial.py` creates `flights`, `manifests`, `sector_classifications`, `engine_cache_entries`, and `tiles` (with the canonical `satellite-provider` columns). The first onboard-extension migration `0002_add_onboard_columns_to_tiles.py` adds the onboard-only columns (`flight_id`, `companion_id`, `quality_metadata`, `voting_status`, `freshness_status`, `signature`) plus the indexes specific to those columns.
Splitting the baseline this way keeps the diff against `satellite-provider`'s migration history mechanically clear. + +--- + +## 5. Seed Data + +| Table | dev-tier1 (Tier-1 Docker) | staging-tier1 / staging-tier2 (CI) | production | +|---|---|---|---| +| `flights` | One synthetic flight row keyed against `tests/fixtures/flight_derkachi/` | Synthetic flights per replay scenario; matches IT-12 comparative-study fixture set | Real flight data — written by takeoff-load code; NEVER seeded | +| `tiles` (`source='googlemaps'`) | Loaded from `tests/fixtures/tiles_corpus/` (a curated, ≤100 MB subset of `satellite-provider`'s output for the Derkachi area) | Same as dev-tier1; the e2e-test `mock-suite-sat-service` fixture populates additional rows on demand for upload-side scenarios | Written by C11 `TileDownloader` from the operator workstation; NEVER seeded | +| `tiles` (`source='onboard_ingest'`) | Synthetic injected for IT / NFT replays | Synthetic injected for NFT-SEC-01 cache-poisoning Monte-Carlo runs | NEVER seeded (only takeoff-load orthorectification code can write) | +| `sector_classifications` | One `active_conflict` polygon over Derkachi + one `stable_rear` polygon (test fixtures) | Same | Loaded by C10 from operator-staged manifest; NEVER seeded | +| `manifests` | One pre-built manifest for the Derkachi fixture set | Multiple — one per CI scenario | Loaded by C10 from operator-staged manifest; NEVER seeded | +| `engine_cache_entries` | Tier-1 has no GPU engines; rows reference Tier-2-built engines pulled from a CI artifact cache | Tier-2 builds engines on first run; rows generated then | Loaded by C10 from operator-staged engine cache; NEVER seeded | + +**Hard rule**: production NEVER seeds any of these tables. 
The pre-flight cache build flow (C11 `TileDownloader` writes `tiles` rows; C10 writes `manifests` and `engine_cache_entries`) is the only writer of canonical pre-flight rows; the takeoff-load flow is the only writer of flight rows; the in-flight orthorectifier is the only writer of `source='onboard_ingest'` rows. A test fixture writing into a production database is a defect (per coderule.mdc "mocking data is needed only for tests, never mock data for dev or prod env"). + +**Seed mechanism (Tier-1 / staging)**: `tests/fixtures/seed_db.py` is the canonical Tier-1 seeder; it issues only INSERTs against a dedicated test database. It is invoked by the test harness (`pytest`) and by the `docker compose` developer setup. It does NOT use Alembic — it operates on the schema Alembic has already created. + +--- + +## 6. Backward Compatibility + +### 6.1 Default: additive-only + +Every schema change is **additive by default**: + +- New tables — additive; existing code paths unaffected. +- New columns — additive; default value or NULL; existing rows backfilled per migration data step. +- New indexes — additive; not visible to read-paths semantically, only to performance. +- New CHECK constraints — additive **only if** all existing rows already satisfy the constraint; otherwise the migration MUST split into "backfill rows" then "add constraint". +- New jsonb keys (e.g., in `tiles.quality_metadata`) — additive; readers MUST tolerate missing keys (treat as absent / null). +- New FDR record types — additive; readers skip unknown types. + +### 6.2 Forbidden by default + +The following changes are **forbidden by default** and require an ADR + user sign-off: + +- Renaming a column or table. +- Dropping a column or table. +- Tightening a column type (narrowing). +- Tightening a CHECK constraint that existing rows could violate. +- Removing a value from a CHECK constraint enum. 
+ +This rule is the data-model expression of `coderule.mdc`'s "Do not rename any databases or tables or table columns without confirmation." + +### 6.3 The `tiles` schema specifically: frozen on canonical columns + +Because the `tiles` schema MUST stay byte-compatible with `satellite-provider`'s on-disk and ingest format, the **canonical columns** (those marked "both" in § 2.1.1) are **frozen**: this project never renames them, narrows their types, or drops them. Coordination with the parent suite is the only path to canonical-column changes; the onboard-only columns can evolve under the additive-only rule above without parent-suite coordination. + +If `satellite-provider` introduces a canonical-column change in a future cycle, the onboard project must: + +1. Open an ADR documenting the upstream change. +2. Run a migration that mirrors the upstream change exactly. +3. Re-validate that the post-landing upload payload (D-PROJ-2 contract) still matches. +4. Re-run the post-landing upload integration test (`mock-suite-sat-service` swapped for the real `satellite-provider`). + +### 6.4 Schema versioning for jsonb sub-schemas + +`tiles.quality_metadata` (and any future jsonb fields with non-trivial structure) carries a `_v` integer key. Readers branch on `_v`: + +- `_v == 1` (current): the schema in § 2.1.3. +- `_v` missing: treat as `_v == 1` (back-compat for rows written before `_v` was added). +- `_v >= 2`: future additive evolution; readers MUST tolerate unknown additional keys. + +Schema-version bumps are tracked in `_docs/02_document/schemas/` (a new `tiles_quality_metadata_v{N}.json` per bump). + +### 6.5 FDR file-format compatibility + +The FDR `record_header` is fixed at version 1. Every FDR reader (operator-tooling, replay tools) MUST: + +- Validate `magic == 0x47464452` and skip any segment that fails this check as corrupt. +- Read the `version` field; on `version != 1`, refuse to interpret the body and emit an "unknown FDR version" diagnostic. 
+- For known `version == 1`, read the `type` field; on unknown `type`, advance the cursor by `body_length + 4` (CRC) and continue. + +The FDR file format will be bumped to version 2 only if a structural change to `record_header` is required; that change is gated on the same ADR + user sign-off rule as a forbidden-by-default schema change. + +--- + +## 7. Non-Persistent Runtime Entities (referenced for completeness only) + +The following DTOs flow through the per-frame pipeline in memory and are **NOT** persistent (except as derived FDR records — see § 2.8 above). They are documented here because § 4 of `architecture.md` lists them as "Core entities" — but the data-model migration scope does not include them. Their full schema lives in component specs (Plan Step 3). + +| DTO | Producer | Consumers | Persisted as | +|---|---|---|---| +| `NavCameraFrame` | Camera ingest thread | C1, C2 | Never (AC-8.5 forbids raw-frame storage; the only forensic exception is AC-8.5 ≤ 0.1 Hz failed-tile thumbnails inside the FDR) | +| `ImuSample` / `ImuWindow` | C8 inbound side | C1, C5 | FDR `ImuTrace` records (lossy summary, not full stream) | +| `VioOutput` | C1 | C5 | FDR `SystemHealth` (covariance summary only); not as a row | +| `VprQuery` / `VprResult` | C2 | C2.5 | Never (transient retrieval state) | +| `RerankResult` | C2.5 | C3 / C3.5 | Never | +| `MatchResult` | C3 / C3.5 | C4 | Never | +| `PoseEstimate` | C4 → C5 | C8, C13 | FDR `EmittedExternalPosition` records (post-emission); also feeds `tiles.quality_metadata` for in-flight orthorectified tiles | +| `EmittedExternalPosition` | C8 | FC, FDR | FDR record | +| `FlightStateSignal` | C8 inbound side | flight-state guard, FDR | FDR `ComponentLifecycleEvent` on transition | +| `CameraCalibration` (loaded once) | calibration loader | C1, C3, C4 | NOT in PostgreSQL — see § 2.6 | + +--- + +## 8. 
Open Items / Plan-Phase Carryforward + +- **`satellite-provider` schema-drift coordination** (recurring): the canonical-columns freeze depends on an explicit cross-project schema review every cycle. Captured as a Plan-Step-4 risk register entry; carryforward. +- **D-PROJ-2 #1 ingest-endpoint contract**: the `signature` column's exact algorithm (Ed25519 vs ECDSA) and the per-flight key distribution is a parent-suite design decision; onboard side is contract-flexible and treats `signature` as opaque `bytea`. +- **D-PROJ-2 #2 voting-layer schema**: parent-suite-side; this onboard data model writes `voting_status='pending'` and reads `'trusted'` only — the actual promotion table lives in `satellite-provider`'s schema and is out of scope here. +- **GeoJSON polygon precision** (`sector_classifications.polygon_geojson`): GeoJSON is precision-bounded by JSON number representation; if AC-NEW-7 cache-poisoning safety needs sub-metre polygon edges, a future migration can switch to PostGIS `geography(Polygon, 4326)`. Captured as carryforward (currently no AC requirement to do so). +- **FDR retention policy default**: 30 days post-landing is a reasonable default but is not pinned in any AC; carryforward to the operator-tooling spec (C12) for confirmation. diff --git a/_docs/02_document/deployment/ci_cd_pipeline.md b/_docs/02_document/deployment/ci_cd_pipeline.md new file mode 100644 index 0000000..8128ca6 --- /dev/null +++ b/_docs/02_document/deployment/ci_cd_pipeline.md @@ -0,0 +1,194 @@ +# GPS-Denied Onboard — CI/CD Pipeline + +> Date: 2026-05-09 (Plan Phase 2c — initial draft). +> Inputs: `_docs/02_document/architecture.md` § 3 (Deployment Model); ADR-002 (build-time exclusion); ADR-005 (Tier-1 / Tier-2 are first-class); ADR-007 (`mock-suite-sat-service` is an e2e-test fixture; reversed 2026-05-09 from the earlier "real component boundary" framing). 
+ +## Pipeline Overview + +The pipeline has **two execution tiers** (architecture.md ADR-005), reflected in two CI runner pools that share the same workflow definitions but differ in runner labels and active job set: + +| Stage | Trigger | Runner | Quality Gate | +|-------|---------|--------|-------------| +| Lint | Every push, every PR | Tier-1 (GitHub-hosted x86_64) | Zero lint errors (Python: `ruff` + `mypy --strict`; C++: `clang-format --dry-run` + `clang-tidy`; CMake: `cmakelang`) | +| Unit | Every push, every PR | Tier-1 | All unit tests pass; coverage ≥ 75 % per component, ≥ 90 % on safety-critical (C5 state estimator, C8 FC adapters) | +| Integration (Tier-1) | Every push, every PR | Tier-1 | Tier-1 integration suite passes (uses `docker-compose.test.yml` — companion + mock-sat + db + e2e-runner) | +| Build (Tier-1, both binaries) | Every push, every PR | Tier-1 | `companion-tier1:deployment-` AND `companion-tier1:research-` build green (ADR-002 dual-emit) | +| SBOM diff | After build | Tier-1 | Deployment SBOM excludes `vins_mono`, `salad`, etc.; research SBOM includes all strategies; PR fails on mismatch | +| Security | After build | Tier-1 | Zero unpatched critical / high CVEs (`pip-audit` + `dotnet list package --vulnerable` for mock-sat + Trivy on images) | +| Push images (Tier-1) | PR merge to `dev`, `stage`, `main` | Tier-1 | Push succeeds; PRs do NOT push (avoids polluting registry) | +| Build (Tier-2 deployment binary) | PR merge to `dev`, `stage`, `main` | Tier-2 (self-hosted Jetson) | Native build on Jetson green; deployment binary SBOM matches Tier-1 deployment SBOM | +| AC-bound NFTs (Tier-2) | PR merge to `dev`, `stage`, `main`; manual on PR | Tier-2 | NFT-PERF-* (AC-4.1, AC-NEW-1, AC-NEW-2), NFT-LIM-* (AC-4.2, AC-NEW-3), NFT-RES-* (AC-NEW-4, AC-NEW-7), IT-12 (comparative study) all pass thresholds in `tests/traceability-matrix.md` | +| JetPack image build | Tag on `main` | Tier-2 | JetPack 6.2 image built with deployment binary preinstalled, 
signed, and attested | +| Operator tooling tarball | Tag on `main` | Tier-1 | Tarball contains C11 Tile Manager (both `TileDownloader` and `TileUploader`) + C12 Operator Pre-flight Tooling + mock-sat-service compose + verification script | + +Tier-2 jobs are the **only** AC-bound jobs. Everything else runs on Tier-1. + +## Stage Details + +### Lint + +Lint jobs run in parallel per language inside one Tier-1 workflow. Within each language, files are checked and reported sequentially so that a single failure is easy to grep in the log. + +| Language | Tool | Rules | +|---|---|---| +| Python | `ruff` (formatter + linter) | Project's `pyproject.toml` configures rules; `ruff check` enforces the lint rules and `ruff format --check` enforces that the committed code is formatted | +| Python types | `mypy --strict` | Strict mode; all components must type-check (CI fails on `error: ...`) | +| C++ | `clang-format --dry-run --Werror` + `clang-tidy` | `.clang-format` lives at repo root; `clang-tidy` checks listed in `.clang-tidy` | +| CMake | `cmakelang` (`cmake-format --check`) | `.cmake-format.yaml` lives at repo root | +| YAML / Markdown | `yamllint`, `markdownlint-cli` | Used for `.github/`, `_docs/`, `docker-compose*.yml` | + +### Unit + +| Component | Framework | Coverage gate | +|---|---|---| +| Python (host code) | `pytest` + `pytest-cov` | `--cov-fail-under=75` per component; safety-critical (C5, C8) at `--cov-fail-under=90` | +| C++ (per-strategy native builds) | `gtest` + `lcov` | Per-strategy library `≥ 75 %` line coverage; `klt_ransac` (mandatory simple-baseline) at `≥ 90 %` | +| Mock sat service (.NET) | `dotnet test` + `coverlet` | `≥ 75 %` line coverage on the mock | + +Coverage report is published as a pipeline artifact (`coverage/index.html`). CI fails fast on threshold violation. + +### Integration (Tier-1) + +Drives the autodev e2e contract: runs `docker compose -f docker-compose.test.yml up --abort-on-container-exit --exit-code-from e2e-runner --build` from the repo root and captures `e2e/results/report.csv`. 
+ +Coverage scenarios on Tier-1: + +- All FT (Functional Test) and IT (Integration Test) scenarios that DO NOT require Jetson hardware (per `tests/traceability-matrix.md` "Tier" column). +- `mock-suite-sat-service` interactions including failure injection (latency, 5xx, partial responses, cache poisoning replay). +- Cross-FC adapter behavior on SITL: ArduPilot Plane SITL runs as a sidecar container; iNav SITL runs as a sidecar container; companion's MAVLink and MSP2 paths are exercised against both. +- D-PROJ-2 contract: post-landing upload payload assembly + signature verification against the mock. + +### Build (Tier-1, both binaries) + +Per ADR-002, every PR produces both binaries. The build job uses two parallel matrix entries with identical Dockerfile + different `BUILD_*` flags: + +```yaml +matrix: + build_kind: + - { tag: deployment, args: "BUILD_VINS_MONO=OFF BUILD_SALAD=OFF" } + - { tag: research, args: "BUILD_VINS_MONO=ON BUILD_SALAD=ON" } +``` + +The Dockerfile receives the args; `cmake -DBUILD_VINS_MONO=$BUILD_VINS_MONO -DBUILD_SALAD=$BUILD_SALAD` enforces the exclusion at the C++ build layer; `setup.py` / `pyproject.toml` reads the same env to skip importing excluded modules in the composition root validator. Both images are built; both must build green; both go through SBOM and security gates. + +### SBOM diff (ADR-002 enforcement) + +```yaml +- name: sbom-deployment + run: syft packages docker:gps-denied/companion-tier1:deployment-${{ github.sha }} -o spdx-json > sbom-deployment.json + +- name: sbom-research + run: syft packages docker:gps-denied/companion-tier1:research-${{ github.sha }} -o spdx-json > sbom-research.json + +- name: sbom-diff + run: python ci/sbom_diff.py --deployment sbom-deployment.json --research sbom-research.json +``` + +`ci/sbom_diff.py` enforces: + +- `vins_mono`, `salad`, and any module flagged "research-only" in `_docs/02_document/components/` MUST appear in research SBOM and MUST NOT appear in deployment SBOM. 
+- The deployment SBOM is a strict subset of the research SBOM (i.e., the research binary contains everything the deployment binary contains plus the research-only modules). +- Both SBOMs are attached as workflow artifacts and as release artifacts on tag. + +### Security + +| Check | Tool | Block on | +|---|---|---| +| Python dependency CVEs | `pip-audit` against `pyproject.toml` lockfile | Critical / High severity | +| .NET dependency CVEs | `dotnet list package --vulnerable --include-transitive` | Critical / High severity | +| C++ dependency CVEs | Manual audit via SBOM matched against NVD; `osv-scanner` for known submodule pins | Critical / High severity | +| Image scan | Trivy on all CI-built images | Critical / High severity | +| OpenCV pin gate | CI step asserts the resolved OpenCV version is `≥ 4.12.0` (D-CROSS-CVE-1) | Any version `< 4.12.0` | +| GTSAM CVE re-scan | Monthly scheduled workflow against the GTSAM commit pinned in `cmake/dependencies.cmake` | Any new published CVE | + +### Push images (Tier-1) + +On `push` to `dev`, `stage`, `main`: tag images with `${BRANCH_NAME}-${BUILD_KIND}-${SHORT_SHA}` and push to the registry. PR events do NOT push — PRs get test signal only. + +### Build (Tier-2 deployment binary) + +Self-hosted Jetson runner (`labels: [self-hosted, jetson, orin-nano-super]`) builds the deployment binary natively. The build is **not** containerized (architecture.md § 3 explanation). After build: + +1. Compute the deployment-binary SBOM on Jetson. +2. Compare it byte-for-byte (after canonicalization) against the Tier-1 deployment-binary SBOM. If they diverge, the PR fails — the two binaries must be built from the same source / same dependency pins. +3. Cache the TRT engine builds on the Jetson runner's persistent cache (keyed by manifest hash) so subsequent CI runs reuse them. + +### AC-bound NFTs (Tier-2) + +Run only on the Tier-2 runner. Each NFT corresponds to one or more acceptance-criterion entries in `tests/traceability-matrix.md`. 
The runner: + +1. Pulls the freshly-built deployment binary. +2. Mounts the curated `tests/fixtures/flight_derkachi/` replay corpus. +3. Runs each NFT scenario, captures jetson-stats telemetry (CPU, GPU, temp, throttle, RAM, VRAM), and compares against the AC threshold. +4. Publishes a per-NFT report; pipeline fails if any threshold is missed. + +| NFT scenario | AC | Pass criterion | +|---|---|---| +| NFT-PERF-01 | AC-4.1 | E2E p95 ≤ 400 ms over 1000-frame replay (steady state) | +| NFT-PERF-02 | AC-4.4 | No frame batching detected (per-frame emit gap < 50 ms) | +| NFT-PERF-03 | AC-NEW-1 | Cold-start TTFF p95 < 30 s over 50 cold boots | +| NFT-PERF-04 | AC-NEW-2 | Spoofing-promotion latency p95 < 3 s on AP SITL + iNav SITL | +| NFT-LIM-01 | AC-4.2 | Memory < 8 GB shared (CPU + GPU) over 8 h replay | +| NFT-LIM-02 | AC-NEW-3 | FDR ring stays ≤ 64 GB; no silent drops | +| NFT-LIM-04 | AC-NEW-5 | Workstation thermal-baseline (chamber test deferred) | +| NFT-RES-03 | AC-NEW-4 | Monte Carlo: P(err > 500 m) < 0.1 %, P(err > 1 km) < 0.01 %, with stated 95 % CI | +| NFT-RES-04 | AC-NEW-8 | VISUAL_BLACKOUT mode transition ≤ 400 ms; covariance grows monotonically | +| NFT-SEC-01 | AC-NEW-7 | Cache-poisoning Monte Carlo on onboard side: P(misalign > 30 m) < 1 %, P(> 100 m) < 0.1 %, with 95 % CI | +| NFT-SEC-03 | D-C8-9 | MAVLink 2.0 signing handshake exercised; per-flight rotation logged to FDR | +| NFT-SEC-05 | architecture.md Threat Model | Network-egress-deny on production profile validated (DNS blackhole + iptables OUTPUT REJECT effective) | +| NFT-9 hot-soak | AC-NEW-5 + AC-4.1 | 8 h at +50 °C ambient (chamber if available, else throttle-injection): p95 ≤ 400 ms throughout | +| NFT-10 SBOM CVE audit | D-CROSS-CVE-1 | SBOM clean of unpatched CVEs at audit time; failed scans blocking | +| IT-12 | architecture.md ADR-001 + ADR-002 | Comparative study replays the same fixture against research-binary's all-VIO matrix; report published | + +### JetPack image build 
(release-only) + +Runs when a release tag is pushed on `main`. Produces `gps-denied-jetpack-<version>-<sha>.img` (the deployable JetPack image) plus a signed checksum. The image is uploaded to the release bucket; the checksum is signed with a release key stored in the Tier-1 secret manager. + +### Operator tooling tarball (release-only) + +Bundles `operator-tooling` Docker image + `mock-suite-sat-service` Docker image + their compose file + a verification script + the documentation under `_docs/02_document/`. The tarball is uploaded to the release bucket alongside the JetPack image. + +## Caching Strategy + +| Cache | Key | Restore Keys | +|-------|-----|-------------| +| Python deps (Tier-1) | `pyproject.toml` hash + Python version | Python version only | +| C++ build deps (Tier-1) | `cmake/dependencies.cmake` hash | n/a — full rebuild on change | +| Docker layers (Tier-1) | `Dockerfile` hash + dep-file hashes | Dockerfile hash | +| TRT engine cache (Tier-2) | manifest hash from `_docs/02_document/data_model.md` § 2.4 (`engine_cache_bundle_hash`) | none (engine cache is per-tuple; reuse only on exact tuple match) | +| Tier-1 build artifacts | `git-sha` | branch name | +| Replay fixtures | `tests/fixtures/flight_derkachi/` content hash | n/a | + +## Parallelization + +``` +push → [ lint || unit (parallel per component) ] (Tier-1) + → integration (Tier-1; sequential) + → build matrix [deployment, research] (Tier-1; parallel) + → [ SBOM diff || security ] (Tier-1; parallel) + → push images (Tier-1; merge events only) + → [ Tier-2 build || Tier-1 release prep (on tag) ] (parallel) + → AC-bound NFTs (Tier-2; on merge events; sequential per scenario, parallel where the AC allows) + → release (on tag; sequential) +``` + +Tier-1 stages from `lint` through `push images` typically complete in ≤ 12 min; Tier-2 NFTs take 1–4 h depending on the replay corpus length and the active scenario set. 
+ +## Notifications + +| Event | Channel | Recipients | +|-------|---------|-----------| +| Build failure (Tier-1) | Slack `#gps-denied-ci` | onboard team | +| Tier-2 NFT failure | Slack `#gps-denied-ci` + email | onboard team + safety reviewer | +| Security alert (CVE block) | Slack `#gps-denied-ci` + email | onboard team + suite security | +| SBOM diff fail (ADR-002) | Slack `#gps-denied-ci` + PR comment | PR author | +| Deploy success (release) | Slack `#gps-denied-releases` | suite-wide | +| JetPack image signature mismatch | Slack `#gps-denied-ci` + email + page | release engineer + safety reviewer | + +## Manual-trigger override + +Initially, AC-bound NFTs may run on manual trigger only while the Tier-2 runner is being provisioned and the test fixtures are being authored. Until that gating is removed, the merge gate on `dev` excludes Tier-2; `stage` and `main` retain the full gate. The exception is documented in `_docs/02_document/deployment/deployment_procedures.md` § Tier-2 enablement. + +## Reference: Woodpecker CI two-workflow contract + +The parent suite uses Woodpecker for some sibling components. If the project decides to migrate from GitHub Actions to Woodpecker, the canonical contract from `.cursor/skills/deploy/templates/ci_cd_pipeline.md` § Reference Implementation applies (`.woodpecker/01-test.yml` + `.woodpecker/02-build-push.yml`, multi-arch matrix). Migration is an explicit decision, NOT current state — current pipeline is GitHub Actions plus a self-hosted Jetson runner. diff --git a/_docs/02_document/deployment/containerization.md b/_docs/02_document/deployment/containerization.md new file mode 100644 index 0000000..d2ab4e8 --- /dev/null +++ b/_docs/02_document/deployment/containerization.md @@ -0,0 +1,245 @@ +# GPS-Denied Onboard — Containerization + +> Date: 2026-05-09 (Plan Phase 2c — initial draft). 
+> Inputs: `_docs/02_document/architecture.md` § 3 (Deployment Model); `_docs/00_problem/restrictions.md` § Onboard Hardware; ADR-002 (build-time exclusion of unused strategies); ADR-005 (Tier-1 / Tier-2 are first-class). + +## Containerization scope + +This project has **asymmetric containerization** by design (architecture.md § 3, ADR-005): + +- **Tier-1** (workstation): Docker is the universal runtime. Dev, lint, unit, most integration, and `mock-suite-sat-service` all run in Docker compose. +- **Tier-2 (Jetson)**: **NO Docker**. The deployed JetPack image runs the deployment binary natively. TensorRT INT8 calibration caches and `jetson-stats` thermal telemetry are most reliable without a container layer (D-C7-9 + D-C10-6). The "image" is a JetPack 6.2 system image with the deployment binary preinstalled. +- **Operator workstation**: Docker is used for the local `satellite-provider` mirror, the `mock-suite-sat-service` (when offline), and the operator-tooling stack (C11 Tile Manager + C12 Operator Pre-flight Tooling). + +Three Dockerfiles are maintained; the airborne companion uses **none of them** in production. + +## Component Dockerfiles + +### `gps-denied-companion-tier1` (Tier-1 dev / CI only) + +This image is for fast iterative development on a workstation. It is **never** flashed onto a Jetson. + +| Property | Value | +|----------|-------| +| Base image | `nvidia/cuda:12.6.0-runtime-ubuntu22.04` (or `python:3.10-slim` if no GPU on dev box) | +| Build image | `nvidia/cuda:12.6.0-devel-ubuntu22.04` | +| Stages | `system-deps` → `python-deps` → `cpp-build` (CMake + GTSAM + FAISS + OpenCV + OKVIS2 + KltRansac) → `runtime` | +| User | `companion` (UID 1000, non-root) | +| Health check | `python -m gps_denied.healthcheck` (validates calibration JSON loadable + DB reachable + FAISS index mmap-able). 30 s interval. 
| +| Exposed ports | `5101/tcp` (companion control plane — Tier-1 only; Tier-2 production has no inbound network) | +| Key build args | `BUILD_VINS_MONO=OFF` (deployment build), `BUILD_SALAD=OFF`; `BUILD_VINS_MONO=ON BUILD_SALAD=ON` for the research build | +| Notes | Two distinct image tags built on every PR: `companion-tier1:deployment-` and `companion-tier1:research-` (ADR-002). | + +### `mock-suite-sat-service` (Tier-1 e2e-test fixture; ADR-007 reversed 2026-05-09 — fixture only, not a component) + +e2e-test fixture only — implements the planned D-PROJ-2 ingest contract (`POST /api/satellite/tiles/ingest`) so upload integration tests can run before the real endpoint ships service-side. Production never reaches it; the architectural counterparty for upload is the real `satellite-provider`. Download integration tests target the real `satellite-provider` directly (its GET surface is already implemented), not this fixture. Source lives under `tests/fixtures/mock-suite-sat-service/`, NOT `src/components/`. + +| Property | Value | +|----------|-------| +| Base image | `mcr.microsoft.com/dotnet/aspnet:8.0-alpine` (matches the parent suite's stack) | +| Build image | `mcr.microsoft.com/dotnet/sdk:8.0-alpine` | +| Stages | `restore` → `build` → `publish` → `runtime` | +| User | `mock` (non-root) | +| Health check | HTTP `GET /healthz` (returns 200 if listening + storage backend mounted). 10 s interval. | +| Exposed ports | `5100/tcp` (matches `satellite-provider`'s port so the same client config works) | +| Key build args | `MOCK_FAILURE_PROFILE` (default `none`; used by NFT-SEC-01 to inject latency / 5xx / partial responses) | +| Notes | The mock is a release artifact (operator-tooling tarball includes its compose file). When the real `satellite-provider` D-PROJ-2 endpoint ships, the mock is retired. 
| + +### `operator-tooling` (Operator workstation Tile Manager + pre-flight UI, C11 + C12) + +| Property | Value | +|----------|-------| +| Base image | `python:3.10-slim` | +| Build image | `python:3.10-slim` (no native deps; pure Python plus `httpx` for both download and upload, `psycopg` for read/write of C6 mirror, `cryptography` for upload signing) | +| Stages | `python-deps` → `runtime` | +| User | `operator` (non-root) | +| Health check | `python -m operator_tooling.healthcheck` (validates `satellite-provider` reachable). 30 s interval. | +| Exposed ports | `8080/tcp` (operator pre-flight UI, C12); no inbound network for C11 Tile Manager (it's a CLI / one-shot tool, both directions) | +| Key build args | `INCLUDE_PRE_FLIGHT_UI=true` (default; can be turned off for headless CLI-only deployments) | +| Notes | **C11 Tile Manager (both `TileDownloader` and `TileUploader`) is in this image, NEVER in `gps-denied-companion-tier1`** (ADR-004 process-level isolation). The airborne deployment binary on Tier-2 also does not contain C11. | + +## Docker Compose — Local Development + +```yaml +# docker-compose.yml +services: + companion: + build: + context: . 
+ dockerfile: docker/companion-tier1.Dockerfile + args: + BUILD_VINS_MONO: "OFF" + BUILD_SALAD: "OFF" + image: gps-denied/companion-tier1:dev + environment: + - DB_URL=postgresql://gps_denied:dev@db:5432/gps_denied + - SATELLITE_PROVIDER_URL=http://mock-sat:5100 + - CAMERA_CALIBRATION_PATH=/fixtures/calibration/adti26.json + - LOG_LEVEL=DEBUG + - GPS_DENIED_FC_PROFILE=ardupilot_plane + volumes: + - ./tests/fixtures:/fixtures:ro + - tile-cache:/var/lib/gps-denied/tiles + - fdr:/var/lib/gps-denied/fdr + depends_on: + db: { condition: service_healthy } + mock-sat: { condition: service_healthy } + healthcheck: + test: ["CMD", "python", "-m", "gps_denied.healthcheck"] + interval: 30s + timeout: 10s + retries: 3 + networks: [ gps-denied-net ] + + mock-sat: + build: + context: ./mock-suite-sat-service + dockerfile: Dockerfile + image: gps-denied/mock-suite-sat-service:dev + environment: + - ASPNETCORE_URLS=http://+:5100 + - MOCK_FAILURE_PROFILE=none + volumes: + - mock-sat-tiles:/srv/tiles + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://localhost:5100/healthz"] + interval: 10s + networks: [ gps-denied-net ] + + db: + image: postgres:16-alpine + environment: + - POSTGRES_DB=gps_denied + - POSTGRES_USER=gps_denied + - POSTGRES_PASSWORD=dev + volumes: + - db-data:/var/lib/postgresql/data + - ./docker/db-init:/docker-entrypoint-initdb.d:ro + healthcheck: + test: ["CMD", "pg_isready", "-U", "gps_denied"] + interval: 5s + networks: [ gps-denied-net ] + + operator-tooling: + build: + context: . 
+ dockerfile: docker/operator-tooling.Dockerfile + image: gps-denied/operator-tooling:dev + environment: + - SATELLITE_PROVIDER_URL=http://mock-sat:5100 + - COMPANION_DB_URL=postgresql://gps_denied:dev@db:5432/gps_denied + ports: + - "8080:8080" + depends_on: + mock-sat: { condition: service_healthy } + networks: [ gps-denied-net ] + +volumes: + tile-cache: + fdr: + db-data: + mock-sat-tiles: + +networks: + gps-denied-net: +``` + +## Docker Compose — Tier-1 Integration & Blackbox Tests + +```yaml +# docker-compose.test.yml +services: + companion: + extends: + file: docker-compose.yml + service: companion + environment: + - LOG_LEVEL=INFO + - GPS_DENIED_REPLAY_FIXTURE=/fixtures/flight_derkachi + - GPS_DENIED_TIER=1 + + mock-sat: + extends: + file: docker-compose.yml + service: mock-sat + volumes: + - ./tests/fixtures/tiles_corpus:/srv/tiles:ro + + db: + extends: + file: docker-compose.yml + service: db + volumes: + - ./tests/fixtures/seed-db.sql:/docker-entrypoint-initdb.d/01_seed.sql:ro + + e2e-runner: + build: + context: ./e2e + dockerfile: Dockerfile + image: gps-denied/e2e-runner:dev + depends_on: + companion: { condition: service_healthy } + mock-sat: { condition: service_healthy } + db: { condition: service_healthy } + environment: + - PYTEST_ARGS=--csv=/results/report.csv -v + volumes: + - ./e2e/results:/results +``` + +Run: `docker compose -f docker-compose.test.yml up --abort-on-container-exit --exit-code-from e2e-runner --build`. + +## Tier-2 — Jetson runtime (NO Docker) + +The Tier-2 deployment is a **JetPack 6.2 system image**, not a container. Its assembly is documented in `deployment_procedures.md` § Production Deployment. Key constraints driving the no-Docker decision (architecture.md § 3, D-C7-9 + D-C10-6): + +1. **TensorRT INT8 calibration caches**: most reliable when the SM/JetPack/TRT triple matches the host kernel exactly; container-host abstraction is a known source of drift. +2. 
**`jetson-stats` thermal telemetry**: needs root + sysfs access; runs cleanest on bare metal. +3. **AC-NEW-1 cold-start budget (30 s p95)**: container start adds 1–2 s overhead the budget cannot afford. +4. **AC-NEW-3 FDR storage (≤ 64 GB)**: the FDR ring is mounted on the host's NVM directly; a container layer would either bind-mount (no benefit) or copy (defeats the storage guarantee). + +Tier-2 CI runs the same deployment binary directly on the self-hosted Jetson runner, with no container shim. + +## Image Tagging Strategy + +| Context | Tag Format | Example | +|---------|-----------|---------| +| CI build (deployment binary) | `<registry>/gps-denied/companion-tier1:deployment-<sha>` | `ghcr.io/azaion/gps-denied/companion-tier1:deployment-a1b2c3d` | +| CI build (research binary) | `<registry>/gps-denied/companion-tier1:research-<sha>` | `ghcr.io/azaion/gps-denied/companion-tier1:research-a1b2c3d` | +| Mock sat service | `<registry>/gps-denied/mock-suite-sat-service:<sha>` | `ghcr.io/azaion/gps-denied/mock-suite-sat-service:a1b2c3d` | +| Operator tooling | `<registry>/gps-denied/operator-tooling:<sha>` | `ghcr.io/azaion/gps-denied/operator-tooling:a1b2c3d` | +| Release | `<registry>/gps-denied/<component>:<version>` | `ghcr.io/azaion/gps-denied/companion-tier1:deployment-1.2.0` | +| Local dev | `gps-denied/<component>:dev` | `gps-denied/companion-tier1:dev` | +| JetPack image (Tier-2) | `gps-denied-jetpack-<version>-<sha>.img` | `gps-denied-jetpack-1.2.0-a1b2c3d.img` (file artifact, not a container tag) | + +## SBOM and binary track + +CI emits both Tier-1 binary tracks on every PR (ADR-002). After build, an SBOM diff step asserts: + +- The deployment-binary SBOM **must NOT** include `vins_mono`, `salad`, or any other research-only library. +- The research-binary SBOM **must** include every strategy listed in the architecture. + +A failing SBOM diff fails the PR. SBOM artifacts are attached to the release; they are NOT shipped on the deployed Jetson image (they live only in the release artifacts directory). 
+ +## .dockerignore + +``` +.git +.cursor +_docs +_standalone +node_modules +**/bin +**/obj +**/__pycache__ +**/.venv +**/venv +**/.pytest_cache +**/.mypy_cache +*.md +.env* +docker-compose*.yml +tests/fixtures/large_replays/ +``` + +The `tests/fixtures/large_replays/` exclusion is critical: that directory holds the Derkachi flight footage (multi-GB) which is mounted into the test runner via `volumes:` rather than baked into images. diff --git a/_docs/02_document/deployment/deployment_procedures.md b/_docs/02_document/deployment/deployment_procedures.md new file mode 100644 index 0000000..53fb01c --- /dev/null +++ b/_docs/02_document/deployment/deployment_procedures.md @@ -0,0 +1,265 @@ +# GPS-Denied Onboard — Deployment Procedures + +> Date: 2026-05-09 (Plan Phase 2c — initial draft). +> Inputs: `_docs/02_document/architecture.md` § 3 (Deployment Model) + § 7 (Security); `_docs/02_document/data_model.md` § 4 (Migration Strategy); environment_strategy.md; ADR-002, ADR-004, ADR-005; AC-NEW-1, AC-NEW-3, AC-NEW-4, AC-NEW-5. + +## Deployment scope and model + +This project does **not** ship a service; it ships an **embedded edge image** plus an **operator-tooling bundle**. The "deployment" patterns from the standard template (blue-green / rolling / canary) are not applicable. Deployment for this project means: + +| Artifact | Target | Deployment mechanism | +|---|---|---| +| **JetPack image** (`gps-denied-jetpack--.img`) | Production Jetson Orin Nano Super on a UAV | Operator flashes the image onto the Jetson via NVIDIA `sdkmanager` or `Etcher`-style `dd` from the operator workstation | +| **Operator tooling tarball** | Operator workstation | Operator extracts; `docker compose up -d` brings up `mock-suite-sat-service` (when offline) + `operator-tooling` | +| **Tier-1 dev compose** | Developer workstation | Developer runs `docker compose up` from repo root | + +**Zero-downtime is not a goal**: a UAV is not in service while it is being re-flashed. 
The deployment cadence is per-airframe maintenance, not per-request availability. + +**Strategy**: the closest analogue to a "rolling deploy" is the operator's fleet-management process (re-flash one UAV at a time across the fleet). The fleet-management process is the operator's concern, not this project's; this document covers the per-airframe procedure. + +## Pre-deployment artifact assembly (release engineer) + +Performed once per release on Tier-1 + Tier-2 CI; produces signed artifacts stored in the release bucket. + +1. Tag a commit on `main`. CI runs the full pipeline (`ci_cd_pipeline.md`). +2. **Tier-1 produces**: + - `companion-tier1:deployment-` and `companion-tier1:research-` Docker images (pushed to registry). + - `mock-suite-sat-service:` Docker image. + - `operator-tooling:` Docker image. + - SBOM artifacts for both binaries (deployment and research). + - `operator-tooling--.tar.gz` containing the operator-tooling image + mock-sat image + their compose file + verification script + relevant docs. +3. **Tier-2 produces**: + - Native deployment-binary build on the self-hosted Jetson runner. + - SBOM verification: byte-equal (after canonicalization) to Tier-1's deployment-binary SBOM. Mismatch fails the release. + - **JetPack image build**: a JetPack 6.2 base image with the deployment binary + PostgreSQL 16 + base migrations + `/etc/gps-denied/runtime.yaml` template preinstalled. Output: `gps-denied-jetpack--.img`. +4. **Signing** (Tier-1): + - Both Docker image manifests are signed with the project's release key. + - The JetPack image is signed; checksum is published as a separate signed file (`gps-denied-jetpack--.img.sha256.sig`). + - The operator-tooling tarball is signed. +5. **Release bucket**: artifacts uploaded; release notes published; the previous release's artifacts retained for at least 90 days for rollback support. + +A release fails if any step above fails — including any AC-bound NFT failure on Tier-2 (`ci_cd_pipeline.md` § AC-bound NFTs). 
+ +## Pre-takeoff readiness gate ("health check" analog) + +Production has no `/health/live` HTTP endpoint (no listener; NFT-SEC-05). The companion's "health check" is the **pre-takeoff readiness gate**: a sequence of checks that runs at takeoff load and decides whether the companion is ready to emit external position to the FC. + +| Check | What it validates | Action on failure | +|---|---|---| +| Manifest content-hash gate (D-C10-3) | The on-disk manifest matches the operator-staged manifest hash (data_model.md § 2.4) | FDR record `0x000D ContentHashGateFail` + STATUSTEXT critical + companion refuses to publish a `GPS_INPUT` / `MSP2_SENSOR_GPS` source | +| Camera calibration JSON validation | File present + schema-valid + content-hash matches `manifests.calibration_artifact_hash` | Same | +| FAISS `.index` mmap + content-hash | mmap succeeds + content-hash matches `manifests.descriptor_index_hash` | Same | +| TRT engine cache verification | All required engines present per `engine_cache_entries`; each engine's content-hash matches `engine_hash` | Same | +| `alembic current == head` | DB schema is up-to-date for this binary | Same | +| MAVLink-2.0 signing handshake (AP profile) | Signed handshake with the FC succeeds within AC-NEW-1 30 s budget (D-C8-9 = (d)) | FDR record `MavlinkSigningKeyRotated` with reason "handshake_failed" + STATUSTEXT critical + companion refuses to emit | +| Per-flight key generation | Both per-flight ephemeral keys (MAVLink signing + onboard tile signing) generated and persisted under `/var/lib/gps-denied/per-flight/` | Same | +| Initial frame → emit pipeline test | First nav-camera frame reaches C8 outbound encoder; `EmittedExternalPosition` produced | Same | +| Network egress is denied | Verify no outbound network egress is possible (DNS blackhole effective, iptables OUTPUT REJECT loaded) — defense-in-depth on architecture.md § 7 + NFT-SEC-05 | FDR critical + STATUSTEXT + refuse to emit | + +The gate completes within the AC-NEW-1 30 s 
p95 budget; failure produces a clear FDR + STATUSTEXT trail and the companion's `GPS_INPUT` / `MSP2_SENSOR_GPS` channel stays silent — the FC operates as if no companion-GPS source is available, which is the correct safe-default. + +## Production deployment procedure (per-airframe) + +This is the per-airframe deployment procedure performed by the operator, NOT by CI. + +### 1. Pre-deploy approval + +Required before any production-bound flight: + +- [ ] Release notes for the target version reviewed; AC-NEW-4 / AC-NEW-7 statistical summaries reviewed. +- [ ] All Tier-2 AC-bound NFTs green at the target version (`ci_cd_pipeline.md` § AC-bound NFTs). +- [ ] Security audit of the target version completed (Tier-1 SBOM clean of unpatched CVEs; D-CROSS-CVE-1). +- [ ] D-PROJ-1 calibration step performed on the target Jetson + UAV pairing (hybrid factory + checkerboard-refined; ~1 day per deployed unit). +- [ ] Rollback artifact (the previous release's JetPack image) is staged on the operator workstation. +- [ ] FDR retention policy for this airframe confirmed (default 30 days; environment_strategy.md § Database Management). +- [ ] If switching FC profile (`ardupilot_plane` ↔ `inav`), FC firmware compatibility confirmed. + +### 2. Pre-deploy checks (operator workstation) + +```sh +# Verify the artifact bundle integrity. +cosign verify-blob \ + --signature gps-denied-jetpack--.img.sha256.sig \ + --key gps-denied-release-key.pub \ + gps-denied-jetpack--.img.sha256 + +sha256sum -c gps-denied-jetpack--.img.sha256 + +# Verify the operator-tooling tarball. +cosign verify-blob \ + --signature operator-tooling--.tar.gz.sig \ + --key gps-denied-release-key.pub \ + operator-tooling--.tar.gz +``` + +### 3. Pre-flight cache build (operator-tooling C12) + +Performed on the operator workstation, with `satellite-provider` reachable (locally mirrored or via lab VPN). 
+ +```sh +docker compose -f operator-tooling-compose.yml up -d +# Operator opens http://127.0.0.1:8080 +``` + +The C12 UI walks the operator through: + +1. Upload / select the target operational sector (GeoJSON polygon). +2. Set sector classifications (`active_conflict` ↔ `stable_rear`) — drives freshness threshold (data_model.md § 2.3). +3. Tile download from `satellite-provider` (parent suite) — produces `tiles` rows with `source='googlemaps'` + filesystem JPEGs. +4. Descriptor (FAISS) index generation across the loaded tile corpus. +5. TRT engine compilation on the workstation (Tier-2 emulation if no Jetson is present, or directly on a co-located Jetson dev kit). +6. Manifest generation: hash over (model bundle + calibration JSON + corpus + sector classifications + descriptor index + engine cache). +7. Output: a sealed pre-flight bundle on a USB drive or staged for direct ethernet transfer. + +### 4. JetPack image flash + +Operator flashes the target JetPack image onto the Jetson: + +```sh +sudo dd if=gps-denied-jetpack--.img of=/dev/sdX bs=4M status=progress +# OR via NVIDIA SDK Manager for a more guided flow. +sync +``` + +The flashed image contains: + +- JetPack 6.2 base +- The deployment binary preinstalled at `/opt/gps-denied/` +- PostgreSQL 16 with `alembic` schema initialized at the target migration head +- `/etc/gps-denied/runtime.yaml` template (the operator fills in airframe-specific values: `fc_profile`, `companion_id`) +- A systemd unit `gps-denied.service` that auto-starts at boot + +The image is **identical across UAVs**; per-airframe configuration (`/etc/gps-denied/runtime.yaml`) is filled in after flash. + +### 5. 
Per-airframe configuration + +Operator boots the Jetson in maintenance mode, ssh's in (this is the only time the Jetson has any inbound network surface; closed before takeoff), and: + +```sh +sudo $EDITOR /etc/gps-denied/runtime.yaml +# Set: fc_profile, companion_id, fdr_retention_days, log_level +sudo gps-denied-cli stage-cache /mnt/usb/gps-denied-cache-.tar.gz +# Stages the operator-prepared cache + calibration + manifest into /var/lib/gps-denied/. +sudo gps-denied-cli verify-readiness +# Runs all gate checks except MAVLink signing handshake (which requires the FC to be powered). +``` + +### 6. UAV integration + +- Wire the Jetson UART/USB to the FC. +- For ArduPilot Plane: configure FC parameters per the AP-side checklist (`EK3_SRC1_POSXY = 3` or per D-C8-2 = (b) configuration, `AHRS_EKF_TYPE = 3`). +- For iNav: configure `gps_provider = MSP`, `gps_ublox_use_galileo = OFF`. +- Power up the FC; verify MAVLink signing handshake completes within 30 s (AC-NEW-1). + +### 7. First-flight commissioning + +The first flight on a freshly-deployed airframe is a **commissioning flight**, not a production flight: + +- Operator stays in line-of-sight. +- AC-5.2 fallback (FC IMU-only) is the primary safety net during commissioning. +- Operator manually triggers a `MAV_CMD_REQUEST_MESSAGE` to confirm `GPS_INPUT` is being received and the FC's EKF source-set switch responds correctly. +- If everything looks healthy on the GCS dashboard for 5+ minutes of cruise, the airframe is cleared for production flights. + +### 8. Post-deploy monitoring + +Post first commissioning flight: + +- [ ] FDR retrieved and visualized on operator workstation (operator-tooling C12 dashboard, observability.md § 5.1). +- [ ] AC-NEW-4 statistics for the commissioning flight reviewed; outliers investigated. +- [ ] No FDR segment drops; no `ContentHashGateFail` events. +- [ ] Mid-flight tile generation working (post-landing upload — handle that separately). 
+- [ ] If everything is green, the deployment is finalised; the previous release's JetPack image can be archived (still kept for rollback). + +## Post-landing tile upload (per-flight, ADR-004) + +Per AC-8.4 + ADR-004, upload of mid-flight-generated tiles to `satellite-provider` is **post-landing only**, and uses the operator-tooling's C11 Tile Manager (`TileUploader` interface; a separate binary, never linked into the airborne image). + +```sh +# Operator plugs the companion's NVM into the workstation OR ssh's into the powered-off-then-re-booted Jetson. +docker compose run operator-tooling \ + python -m operator_tooling.tilemanager upload \ + --flight-id <flight_id> \ + --satellite-provider $SATELLITE_PROVIDER_URL \ + --signing-pubkey-fingerprint <fingerprint> +``` + +Behavior: + +- Reads the local `tiles` rows where `source='onboard_ingest' AND voting_status='pending' AND flight_id=<flight_id>`. +- Reads the corresponding JPEG body + sidecar JSON from filesystem. +- Reads the per-flight onboard tile-signing private key (still on the companion's NVM until FDR rolls over). +- Submits to `satellite-provider`'s `POST /api/satellite/tiles/ingest` endpoint (D-PROJ-2 contract). +- On 2xx success: deletes local row + JPEG + sidecar + emits FDR event `tile_uploaded`. +- On 4xx: leaves local data; emits FDR event `tile_upload_failed` with reason; operator decides next steps (likely a parent-suite issue). +- On 5xx: retries with exponential backoff; persistent failure → `tile_upload_failed` + operator review. + +When the parent-suite voting layer (D-PROJ-2 design task #2) ships, this flow does NOT change on the onboard side — the parent suite's promotion logic is invisible to onboard-side upload. 
+ +## Rollback Procedures + +### Trigger criteria + +| Severity | Trigger | Decision-maker | +|---|---|---| +| Critical (per-airframe) | Commissioning flight fails AC-5.2 fallback (the FC IMU-only fallback also failed; airframe lost) | Safety review board (out of scope of this project) | +| Critical (fleet-wide) | Any post-deploy AC-NEW-4 outlier indicates a regression: P(err > 1 km) measured on a real flight > AC threshold by ≥ 2x | Suite security + onboard team lead | +| High (per-airframe) | Commissioning flight passes but post-flight FDR analysis shows AC-NEW-4 / AC-NEW-7 regression vs. prior release | Onboard team lead | +| High (per-airframe) | Operator unable to complete pre-flight readiness gate (manifest hash gate fails repeatedly) | Operator + onboard team lead | +| Medium (per-airframe) | Sustained `dead_reckoned` periods longer than expected; FDR segment drops occurring | Operator + onboard team lead (post-flight investigation; may not warrant immediate rollback) | + +### Rollback steps (per-airframe) + +1. **Re-flash** the previous release's JetPack image onto the affected Jetson (same procedure as § 4 with the previous artifact). +2. **Re-stage** the previous release's pre-flight bundle (the operator workstation retains it in the operator-tooling cache for ≥ 30 days). +3. **Re-run** the pre-takeoff readiness gate. +4. **Confirm** AC-5.2 fallback is still functional (it is FC firmware behavior; rolling back the companion image cannot break it, but verify on the GCS). +5. **Document** the rollback in the post-mortem template; include FDR snapshots from the offending flight (if any) plus the rollback artifacts versions. + +### Database rollback (data_model.md § 4.2 reversibility) + +Per data_model.md § 4.2, every Alembic migration MUST implement a working `downgrade()`. Rolling back the JetPack image to the previous release rolls back the schema to whatever migration head the previous release uses. 
Concretely: + +- The previous release's JetPack image contains its own Alembic migration tree. +- On boot, the previous-release runtime asserts `alembic current == head_for_that_release`. If the database is on a NEWER head (because the airframe ran the new release between deployments), the runtime invokes `alembic downgrade <head_for_that_release>` automatically. +- If a migration is **not reversible** (which requires an explicit ADR — data_model.md § 4.2), the rollback must be manually adjudicated by the operator + onboard team lead. This case is rare by policy. + +### Post-mortem + +Required after every rollback (per-airframe or fleet-wide): + +- Timeline: when was the new release flashed; when did the failure surface; when was rollback initiated. +- Root cause: which AC was missed; which component is implicated; was it a regression introduced by this release or by a hardware/operational variable change. +- What went wrong in the release process: did Tier-2 CI catch it; if not, why not. +- Prevention: new test scenario added to NFT suite; new lint check; new rule in `_docs/LESSONS.md`. +- Distribution: post-mortem report stored under `_docs/06_metrics/incident__.md` (per autodev failure-handling protocol). 
+ +## Deployment Checklist + +Pre-flash: + +- [ ] All Tier-2 AC-bound NFTs green at target version +- [ ] Security scan clean (zero critical / high CVEs; SBOM diff passes ADR-002 enforcement) +- [ ] Both Docker images built and pushed (deployment + research) +- [ ] JetPack image built, signed, checksummed +- [ ] Operator-tooling tarball built, signed +- [ ] Pre-flight bundle prepared by operator (cache + calibration + manifest) +- [ ] Pre-takeoff readiness gate behavior verified on a bench Jetson before flashing onto the production unit +- [ ] Rollback artifact (previous release JetPack image) staged on operator workstation +- [ ] FDR retention policy confirmed for the target airframe + +Post-flash: + +- [ ] First-flight commissioning flight cleared per § 7 +- [ ] FDR retrieved and analyzed; AC-NEW-4 / AC-NEW-7 statistics within expected envelope +- [ ] Post-landing upload procedure tested end-to-end (companion → operator workstation → `satellite-provider`) +- [ ] Operator runbook updated with airframe-specific notes (e.g., "this airframe has UART2 wired to FC") + +## Tier-2 enablement + +Until the Tier-2 self-hosted Jetson runner is fully provisioned: + +- AC-bound NFTs are gated as **manual trigger only** on PRs (`ci_cd_pipeline.md` § Manual-trigger override). +- The merge gate on `dev` excludes Tier-2 NFTs; the merge gate on `stage` and `main` retains the full gate. +- The pre-takeoff readiness gate (§ Pre-takeoff readiness gate) is unaffected — it runs on the Jetson at every takeoff regardless of CI gating posture. + +When the Tier-2 runner is in steady state, this section is removed and the merge gates harmonize across `dev` / `stage` / `main`. 
diff --git a/_docs/02_document/deployment/environment_strategy.md b/_docs/02_document/deployment/environment_strategy.md new file mode 100644 index 0000000..ed62b56 --- /dev/null +++ b/_docs/02_document/deployment/environment_strategy.md @@ -0,0 +1,178 @@ +# GPS-Denied Onboard — Environment Strategy + +> Date: 2026-05-09 (Plan Phase 2c — initial draft). +> Inputs: `_docs/02_document/architecture.md` § 3 (Deployment Model) + § 7 (Security Architecture); `_docs/02_document/data_model.md` § 5 (Seed Data); `_docs/00_problem/restrictions.md`; ADR-002, ADR-004, ADR-005. + +## Environments + +This project has **six environments**, not the canonical three (dev / staging / prod). The asymmetry reflects ADR-005 (Tier-1 / Tier-2) and ADR-004 (process-level isolation between airborne companion image and operator-side upload tool). + +| Environment | Purpose | Infrastructure | Data Source | +|-------------|---------|---------------|-------------| +| `dev-tier1` | Local developer iteration; lint + unit + most integration tests | Workstation (Linux x86_64; NVIDIA GPU optional); Docker compose | Test fixtures (`adti26.json` calibration; `tests/fixtures/flight_derkachi/`) + `mock-suite-sat-service` | +| `dev-tier2` | Hardware-bound developer checks | Jetson Orin Nano Super dev kit on developer's desk; bare JetPack | Test fixtures + locally-mirrored `satellite-provider` | +| `staging-tier1` | CI runs that don't require Jetson hardware | GitHub-hosted runner (x86_64); Docker | Sealed test fixtures committed to the repo | +| `staging-tier2` | CI runs that require Jetson (AC-bound NFT-PERF-*, NFT-LIM-*, NFT-RES-*, NFT-SEC-*, IT-12) | Self-hosted Jetson runner; bare JetPack 6.2 | Same sealed fixtures + cached TRT engines per manifest hash | +| `production` | Deployed onboard companion image on a UAV | Jetson Orin Nano Super (pinned); bare JetPack 6.2; **no inbound network listening; no outbound network egress in flight** (NFT-SEC-05) | Operator-staged pre-flight cache + per-flight 
in-flight orthorectified tiles | +| `production-operator-workstation` | Pre-flight tile download (C11 `TileDownloader`); pre-flight cache artifact build (C10 driven by C12); post-landing tile upload (C11 `TileUploader`); FDR retrieval | Operator's Linux workstation; Docker for `satellite-provider` mirror | Operator-managed `satellite-provider` instance + the companion's NVM contents post-landing | + +Notes: + +- **No "staging" deployment of the companion**. Staging is purely a CI mode — there is no live staging Jetson UAV. Production is one-step from CI release artifacts → operator workstation → flashed Jetson. +- **The airborne companion never sees `staging-*` environments at runtime**. Staging is exclusively a CI gating concept. +- **The operator workstation is its own environment** with its own secrets posture (operator login + workstation hardening) — see § Secrets Management. + +## Environment Variables + +Variables are categorized by which environment(s) consume them. Production has the **shortest** required list because in-flight network egress is forbidden — most of the typical "service URL" variables disappear. 
+ +### Required variables — companion runtime (all environments) + +| Variable | Purpose | dev-tier1 default | dev-tier2 default | production source | +|---|---|---|---|---| +| `DB_URL` | Local PostgreSQL connection | `postgresql://gps_denied:dev@db:5432/gps_denied` | `postgresql://gps_denied:dev@localhost:5432/gps_denied` | `postgresql://gps_denied@/gps_denied?host=/var/run/postgresql` (UNIX socket on Jetson, no password) | +| `CAMERA_CALIBRATION_PATH` | Camera calibration JSON path (Principle #1, data_model.md § 2.6) | `/fixtures/calibration/adti26.json` | `/fixtures/calibration/adti26.json` | `/etc/gps-denied/calibration/adti20.json` (per-deployed-unit, post D-PROJ-1 hybrid) | +| `GPS_DENIED_FC_PROFILE` | `ardupilot_plane` or `inav` | `ardupilot_plane` | per developer's bench setup | per UAV airframe (set via JetPack image's `/etc/gps-denied/runtime.yaml`) | +| `GPS_DENIED_VIO_STRATEGY` | `okvis2`, `vins_mono`, `klt_ransac` (ADR-001 startup-locked) | `okvis2` | `okvis2` | `okvis2` (production-default; pending IT-12 verdict) | +| `GPS_DENIED_VPR_STRATEGY` | `ultra_vpr`, `mega_loc`, `mix_vpr`, ... 
| `ultra_vpr` | `ultra_vpr` | `ultra_vpr` (Documentary Lead PRIMARY) | +| `GPS_DENIED_BUILD_KIND` | `deployment` or `research` (ADR-002; matches the binary's CMake flag set; the runtime validator fails fast if config asks for a strategy not linked into the binary) | `deployment` | `deployment` | `deployment` (research binary is dev-tier2 / staging-tier2 only) | +| `GPS_DENIED_FDR_RETENTION_DAYS` | FDR ring retention (data_model.md § 2.8) | `7` | `30` | `30` (operator-configurable per UAV) | +| `LOG_LEVEL` | `DEBUG` / `INFO` / `WARN` / `ERROR` | `DEBUG` | `INFO` | `INFO` (DEBUG is forbidden on the airborne image — context: no operator-readable console, and DEBUG output on FDR ring would inflate beyond 64 GB AC-NEW-3 envelope) | +| `MAVLINK_SIGNING_KEY_PATH` | Per-flight MAVLink-2.0 signing key file (regenerated at takeoff load; see § Secrets Management) | `/fixtures/keys/dev_mavlink_signing.key` | `/fixtures/keys/dev_mavlink_signing.key` | `/var/lib/gps-denied/per-flight/mavlink_signing.key` (generated at takeoff, deleted on flight ring rollover) | +| `ONBOARD_TILE_SIGNING_KEY_PATH` | Per-flight onboard tile-signing private key | `/fixtures/keys/dev_onboard_signing.key` | `/fixtures/keys/dev_onboard_signing.key` | `/var/lib/gps-denied/per-flight/onboard_tile_signing.key` (generated at takeoff, deleted on flight ring rollover) | + +### Required variables — Tier-1 / staging only (NOT on production) + +| Variable | Purpose | dev-tier1 default | staging-tier1 default | production | +|---|---|---|---|---| +| `SATELLITE_PROVIDER_URL` | Where to reach the tile source for pre-flight runs (CI / dev) | `http://mock-sat:5100` | `http://mock-sat:5100` | **NOT SET** — production never reaches a satellite-provider directly while airborne | +| `MOCK_FAILURE_PROFILE` | Failure injection for `mock-suite-sat-service` | `none` | per CI scenario | n/a | +| `GPS_DENIED_REPLAY_FIXTURE` | Path to replay corpus | `/fixtures/flight_derkachi` | `/fixtures/flight_derkachi` | n/a | + +### 
Required variables — operator workstation + +| Variable | Purpose | Source | +|---|---|---| +| `SATELLITE_PROVIDER_URL` | Operator's local mirror or VPN-reached lab service | Operator config (operator workstation `.env` file) | +| `SATELLITE_PROVIDER_API_KEY` | TLS + service-internal API key for `satellite-provider` (architecture.md § 7) | Operator workstation secret manager (file or system keyring) — NEVER copied onto the companion image | +| `COMPANION_DB_URL` | Direct DB connection to the companion (post-landing) | Set transiently when the operator plugs the companion in for FDR retrieval / upload | +| `OPERATOR_TOOLING_BIND_ADDR` | Pre-flight UI bind address (C12) | `127.0.0.1:8080` (workstation-local; never exposed to network) | + +### `.env.example` + +Two example files are committed: + +`.env.example.dev-tier1`: + +```env +# dev-tier1 - workstation Docker compose +DB_URL=postgresql://gps_denied:dev@db:5432/gps_denied +SATELLITE_PROVIDER_URL=http://mock-sat:5100 +CAMERA_CALIBRATION_PATH=/fixtures/calibration/adti26.json +GPS_DENIED_FC_PROFILE=ardupilot_plane +GPS_DENIED_VIO_STRATEGY=okvis2 +GPS_DENIED_VPR_STRATEGY=ultra_vpr +GPS_DENIED_BUILD_KIND=deployment +GPS_DENIED_FDR_RETENTION_DAYS=7 +GPS_DENIED_REPLAY_FIXTURE=/fixtures/flight_derkachi +LOG_LEVEL=DEBUG +MAVLINK_SIGNING_KEY_PATH=/fixtures/keys/dev_mavlink_signing.key +ONBOARD_TILE_SIGNING_KEY_PATH=/fixtures/keys/dev_onboard_signing.key +MOCK_FAILURE_PROFILE=none +``` + +`.env.example.operator-workstation`: + +```env +# operator workstation +SATELLITE_PROVIDER_URL=http://localhost:5100 # local mirror, or replace with lab VPN URL +SATELLITE_PROVIDER_API_KEY= # populate from the workstation secret manager; NEVER commit +COMPANION_DB_URL= # set when companion is plugged in for FDR retrieval +OPERATOR_TOOLING_BIND_ADDR=127.0.0.1:8080 +``` + +### Variable validation + +The runtime composition root (`src/composition/runtime_root.py`, ADR-009) validates every required variable at startup and fails fast with a 
clear error message. Specifically: + +- **Type validation** for enums (`GPS_DENIED_FC_PROFILE`, `GPS_DENIED_VIO_STRATEGY`, etc.) against the strategies linked into the binary (ADR-002 enforcement at config layer). +- **Path validation** for every `*_PATH` variable: file must exist + (where applicable) content-hash must match `manifests` table entry. +- **Forbidden-pair validation**: `GPS_DENIED_BUILD_KIND=deployment` AND `GPS_DENIED_VIO_STRATEGY=vins_mono` is rejected at startup ("vins_mono is not linked into the deployment binary"). The same check is repeated for any research-only strategy. +- **Production hardening**: when `LOG_LEVEL=DEBUG` is set on a binary built with `GPS_DENIED_BUILD_KIND=deployment` AND a manifest indicates a production deployment, the runtime emits a warning and downgrades to `INFO`. A flag `GPS_DENIED_ALLOW_DEBUG_IN_PROD=1` is required to override (only set when an engineer is debugging a returned-from-flight unit on the bench). + +## Secrets Management + +The threat model (architecture.md § 7) treats the airborne companion as a **remote untrusted endpoint**: a downed UAV's companion can be physically captured. Persistent secrets must therefore be **per-flight ephemeral** wherever feasible. 
+ +| Environment | Mechanism | Tool | +|-------------|--------|------| +| `dev-tier1` | `.env` file (git-ignored) + dev keys (committed test fixtures, clearly marked) | dotenv | +| `dev-tier2` | `.env` file (git-ignored) + dev keys | dotenv | +| `staging-tier1` | GitHub Actions secrets | GitHub-managed | +| `staging-tier2` | GitHub Actions secrets injected onto the self-hosted Jetson runner | GitHub-managed | +| `production` (companion) | **Per-flight ephemeral keys** generated at takeoff load by the takeoff bring-up sequence (C8 signing handshake + per-flight tile signing key seed); written to `/var/lib/gps-denied/per-flight/`; logged to FDR; deleted on flight-ring rollover (≥ 30 days post-landing default) | Local filesystem; no external secret manager | +| `production-operator-workstation` | OS-level secret store (keyring / GNOME secrets / macOS keychain) for the long-lived `SATELLITE_PROVIDER_API_KEY` | OS keyring + workstation hardening | + +### Per-flight key lifecycle (production companion) + +1. **Pre-flight**: operator stages cache + calibration + manifests. NO secrets are baked into the JetPack image — the image is identical across all UAVs the operator deploys. +2. **Takeoff load (F2)**: the takeoff sequence generates two ephemeral keypairs: + - MAVLink-2.0 per-flight signing key (D-C8-9 = (d), driven by C8) — only used on the AP wired channel; iNav has no signing. + - Onboard tile-signing keypair (D-PROJ-2 design task #1 contract) — used to sign every mid-flight tile so the parent suite's planned voting layer can authenticate the source. +3. **In flight**: keys live at `/var/lib/gps-denied/per-flight/*.key` (mode 0600, owned by the runtime UID). The MAVLink signing key fingerprint is logged to FDR record `MavlinkSigningKeyRotated`; the onboard signing pubkey hash is recorded in the `flights` table. +4. 
**Post-landing**: the operator's C11 `TileUploader` uses the onboard tile-signing private key to assemble the upload payload; it's the only post-flight consumer. +5. **Rollover**: when the FDR ring drops a flight, the per-flight key files for that flight are deleted by the same atomic step. + +### No long-lived secrets on the production companion image + +| Type | Where it lives | +|---|---| +| `SATELLITE_PROVIDER_API_KEY` | Operator workstation only; never on the companion image (architecture.md § 7) | +| Per-flight MAVLink signing key | Generated on companion at takeoff; per-flight ephemeral | +| Per-flight onboard tile-signing key | Generated on companion at takeoff; per-flight ephemeral | +| Production deployment binary signing key | Release-time; lives only in the Tier-1 release secret manager | +| JetPack image signing key | Same as above | + +This means the threat surface on a captured companion reduces to "what is in the FDR for the current flight" plus "the public keys of the upstream signing roots" — the latter is publishable without harm. 
+ +### Rotation policy + +| Secret | Rotation cadence | Procedure | +|---|---|---| +| Per-flight MAVLink signing key | Every flight (per-flight ephemeral) | Automated at takeoff load | +| Per-flight onboard tile-signing key | Every flight (per-flight ephemeral) | Automated at takeoff load | +| `SATELLITE_PROVIDER_API_KEY` | Operator-managed; rotated when an operator workstation is reissued or compromise is suspected | Operator workstation hardening procedure (out of scope of this document; operator-tooling C12 owns it) | +| Production binary signing key | Per release cycle or on suspected compromise | Release engineer rotates; new key fingerprint is published in release notes; verification scripts on the operator workstation pull the latest fingerprint | +| JetPack image signing key | Same as production binary signing key | Same | + +## Database Management + +Each companion has its **own local PostgreSQL 16** instance — no shared upstream database, no cluster, no replication. data_model.md § 1 makes this explicit: companion DB is per-companion; cross-companion coordination happens via `satellite-provider` post-landing only. + +| Environment | Type | Migrations | Data | +|-------------|------|-----------|------| +| `dev-tier1` | Docker `postgres:16-alpine`, named volume | Applied on container start by an init script; Alembic-managed (data_model.md § 4) | Seed data via `tests/fixtures/seed-db.sql` | +| `dev-tier2` | PostgreSQL 16 native on the Jetson (or via developer-installed deb packages) | Applied via `alembic upgrade head` invoked by the takeoff-load script | Same seed fixtures | +| `staging-tier1` | Docker `postgres:16-alpine` | Applied by the test runner before scenarios start | Sealed fixture rows | +| `staging-tier2` | PostgreSQL 16 on the Jetson runner | Applied by the test runner | Sealed fixture rows + per-scenario synthetic injections (NFT-SEC-01 cache-poisoning Monte Carlo, etc.) 
| +| `production` | PostgreSQL 16 on the Jetson, native install (part of the JetPack image) | Applied at JetPack image build time by the image builder; companion runtime asserts `alembic current == head` at takeoff load and refuses takeoff on mismatch | Live data only (data_model.md § 5 hard rule: production NEVER seeds) | +| `production-operator-workstation` | Workstation's local `satellite-provider` mirror has its own DB; operator tooling does NOT run a separate DB | Mirror DB is `satellite-provider`'s concern; operator tooling reads it but does not migrate it | Mirror data | + +### Migration rules (data_model.md § 4 + § 6) + +- All migrations must be **additive-only by default** (data_model.md § 6.1). +- All migrations must be **reversible by default** (data_model.md § 4.2). Non-reversible migrations require an ADR + user sign-off. +- The `tiles` schema specifically has its **canonical columns frozen** (data_model.md § 6.3) — coordinate any change with `satellite-provider`'s schema owner. +- Production migrations are applied at JetPack image build time, not at runtime. The companion never invokes `alembic upgrade` against a live database in flight; it only verifies `alembic current == head`. +- Migration scripts are reviewed in the same PR that adds the schema change; a PR-level checklist line in the PR template references this rule. + +## Configuration Loading Order + +Composition root (`src/composition/runtime_root.py`) loads configuration in this strict order — later sources override earlier ones: + +1. `_docs/02_document/runtime_config_defaults.yaml` (project-wide defaults; committed) +2. `/etc/gps-denied/runtime.yaml` (per-airframe overrides; baked into the JetPack image) +3. Environment variables (highest precedence on production; second-highest in dev where the next item exists) +4. 
`--config-override KEY=VALUE` CLI flags (developer convenience; rejected on production by the manifest validator) + +The full resolved configuration is logged to FDR as a `ComponentLifecycleEvent` of type `runtime_config_resolved` at takeoff load — this is the audit record for "what config did this flight actually run with". diff --git a/_docs/02_document/deployment/observability.md b/_docs/02_document/deployment/observability.md new file mode 100644 index 0000000..94db9a7 --- /dev/null +++ b/_docs/02_document/deployment/observability.md @@ -0,0 +1,232 @@ +# GPS-Denied Onboard — Observability + +> Date: 2026-05-09 (Plan Phase 2c — initial draft). +> Inputs: `_docs/02_document/architecture.md` § 7 (Audit logging) + § 6 (NFRs); `_docs/02_document/data_model.md` § 2.8 (FDR); ADR-005 (Tier-1 / Tier-2); AC-NEW-3 (FDR ≤ 64 GB / no silent drops); AC-NEW-5 (operating envelope). + +## Observability is asymmetric by design + +Most CI/CD templates assume a network-connected service that pushes structured logs to an aggregator and exposes Prometheus metrics for live scraping. **This project's airborne profile does not.** Architecture.md ADR-004 + § 7 + Principle #4 require **no inbound network listening and no outbound network egress in flight** (NFT-SEC-05 enforces). The Jetson is operating as an embedded edge device, not a service. + +Observability therefore splits into three regimes: + +| Regime | Where | Live or post-flight | Primary mechanism | +|---|---|---|---| +| **In-flight onboard** | Production Jetson, in flight | Live (to FDR ring) + best-effort live (to GCS) | FDR binary record stream + GCS STATUSTEXT / NAMED_VALUE_FLOAT | +| **Post-flight onboard** | Operator workstation after pulling the FDR | Post-flight | FDR replay + visualization in operator-tooling C12 | +| **CI / dev (Tier-1, Tier-2)** | Workstation Docker / Jetson CI runner | Live | Standard structured logging + Prometheus metrics endpoint where applicable | + +The sections below are organized by regime. 
+ +## 1. In-flight onboard (production Jetson) + +### 1.1 FDR (Flight Data Recorder) — primary observability sink + +Schema is in `data_model.md` § 2.8. Every observable event in flight goes through FDR. The FDR is **append-only**, **lossy on overrun (logged, never silent)**, and **per-flight ring-bounded at ≤ 64 GB** (AC-NEW-3). + +Observability events that emit FDR records: + +| Component | Event | FDR record type | +|---|---|---| +| C8 outbound | Every emitted `EmittedExternalPosition` to FC | `0x0001 EmittedExternalPosition` | +| C8 inbound | Every received MAVLink frame (raw `tlog`-style) | `0x0003 ReceivedMavlinkRaw` | +| C8 inbound (iNav) | Every received MSP2 frame | `0x0004 ReceivedMsp2Raw` | +| C8 inbound | IMU window forwarded to C1 / C5 | `0x0002 ImuTrace` | +| C5 | Source-label transition (`satellite_anchored` ↔ `visual_propagated` ↔ `dead_reckoned`) | `0x0006 SourceLabelTransition` | +| C5 + C8 | Spoofing-promotion / -rejection event | `0x000C SpoofingPromotionEvent` | +| C5 | VISUAL_BLACKOUT entry / exit (AC-3.5, AC-NEW-8) | `0x000B VisualBlackoutEvent` | +| C6 | Mid-flight tile emit | `0x0007 MidFlightTileEmitted` | +| C6 | Mid-flight tile failure (with thumbnail filename, AC-8.5 forensic exception) | `0x0008 MidFlightTileFailed` | +| C7 (inference) | Thermal-throttle hybrid switch K=3 ↔ K=2 | `0x000E ThermalThrottleHybridSwitch` | +| C8 | MAVLink-2.0 signing key rotation event (D-C8-9) | `0x0009 MavlinkSigningKeyRotated` | +| C8 | EKF source-set switch event (D-C8-2 = (b)) | `0x000A EkfSourceSetCommand` | +| C10 | Pre-flight content-hash gate fail | `0x000D ContentHashGateFail` | +| All components | Lifecycle events (start / stop / fail) | `0x000F ComponentLifecycleEvent` | +| `jetson-stats` collector (driven by C7 or a dedicated thread) | Per-second sample of CPU%, GPU%, temp, throttle flag, RAM, VRAM, NVM remaining | `0x0005 SystemHealth` | + +**Lossy-on-overrun rule (AC-NEW-3 enforcement)**: if the FDR writer cannot keep up (NVM I/O bound), the 
writer drops the **oldest segment** in the current flight's ring AND emits a `0x000F ComponentLifecycleEvent` of type `fdr_segment_dropped` to the new head segment. A segment drop is a hard observability signal — it appears in the post-flight report and in the GCS STATUSTEXT stream. There is no path that silently discards an event. + +**Format**: length-prefixed binary stream with `record_header` (magic `0x47464452 "GFDR"` + version + type + monotonic_ms) followed by a per-type body and a CRC32. New record types are additive (data_model.md § 6.5). + +**Storage path**: `/var/lib/gps-denied/fdr/{flight_id}/segments/seg_NNNNN.bin`. Thumbnails (AC-8.5) live at `/var/lib/gps-denied/fdr/{flight_id}/thumbnails/`. A flight's `manifest.json` (the FDR-side mirror, distinct from the PostgreSQL `manifests` row) sits at the flight's root and carries the flight metadata snapshot. + +### 1.2 GCS telemetry (best-effort, bandwidth-limited) + +The GCS link is the only outbound channel from the airborne companion (per architecture.md § 7). Bandwidth is bounded (AC-6.1: 1–2 Hz downsampled summary). The companion emits: + +| MAVLink message | Rate | Content | +|---|---|---| +| `STATUSTEXT` | event-driven (only when something changes) | Source label transitions; spoofing-promotion / -rejection; VISUAL_BLACKOUT entry / exit; signing key rotation; FDR segment drop; component start / fail; thermal-throttle hybrid switch | +| `NAMED_VALUE_FLOAT` | 1 Hz | `horiz_accuracy_m`, `vert_accuracy_m`, `vio_health` (frame-quality 0..1), `last_anchor_age_s`, `cpu_pct`, `gpu_pct`, `temp_c` | +| `GPS_RAW_INT` | 1–2 Hz (AC-6.1) | Mirror of the AP `GPS_INPUT` we just emitted, downsampled — gives the operator a live position view in QGC | + +These are **best-effort** — packet loss on the GCS link is treated as normal. The FDR remains the source of truth. 
+ +**STATUSTEXT severity mapping**: + +| FDR event | STATUSTEXT severity | Example text | +|---|---|---| +| Source label → `dead_reckoned` | `MAV_SEVERITY_WARNING` | `"GPS-DENIED: dead-reckoned (last anchor 12.3s ago)"` | +| VISUAL_BLACKOUT entry | `MAV_SEVERITY_NOTICE` | `"GPS-DENIED: VISUAL_BLACKOUT entered (reason=low_features)"` | +| Spoofing rejected | `MAV_SEVERITY_NOTICE` | `"GPS-DENIED: spoofed FC GPS rejected (last visual consistency PASS 0.4s ago)"` | +| Spoofing promoted (10 s + visual gate passed) | `MAV_SEVERITY_INFO` | `"GPS-DENIED: FC GPS promoted to fused source"` | +| FDR segment dropped | `MAV_SEVERITY_WARNING` | `"GPS-DENIED: FDR segment 47 dropped (NVM bound)"` | +| Signing key rotation | `MAV_SEVERITY_INFO` | `"GPS-DENIED: MAVLink signing key rotated"` | +| Component fail | `MAV_SEVERITY_CRITICAL` | `"GPS-DENIED: VIO strategy fault — failover to FC IMU-only (AC-5.2)"` | + +### 1.3 No console logging in flight + +Production deployment binary refuses `LOG_LEVEL=DEBUG` by default (environment_strategy.md § Variable validation). The airborne companion has no operator-readable console — even ERROR-level logs go to journald + FDR rather than stdout. journald retention is 7 days on a rolling buffer (separate from the FDR's per-flight retention). + +### 1.4 In-flight metrics are NOT scraped + +There is no Prometheus endpoint on the production airborne companion. The justification matches § 1.3: there is no scraper to scrape it; metrics are recorded into FDR and visible via NAMED_VALUE_FLOAT only. CI / dev environments DO expose `/metrics` (see § 3 below). + +## 2. Post-flight onboard (operator workstation) + +When the operator plugs the companion in post-landing: + +1. **FDR retrieval** (operator tooling C12 — feature, not in scope of this document's structure but observability-impacting): operator-tooling reads the FDR ring, copies it to the workstation, and seals the in-flight ring. 
The companion's per-flight ephemeral keys are deleted at this step (environment_strategy.md § Per-flight key lifecycle). +2. **Visualization** (operator tooling C12): the workstation renders: + - Time-series of `horiz_accuracy`, `vert_accuracy`, `last_anchor_age_ms`, source label timeline, thermal-throttle hybrid switches, and CPU / GPU / temp. + - Map view: emitted positions vs. (when available) FC `GLOBAL_POSITION_INT` ground truth. + - Spoofing / VISUAL_BLACKOUT event markers overlaid on the timeline. + - Per-flight summary: total mid-flight tiles emitted, FDR segment drops (if any), AC-NEW-4 / AC-NEW-7 statistics for this flight. +3. **NFT-RES-03 / NFT-SEC-01 corpus contribution**: if the operator opts in, the flight's emitted positions + FC ground truth are added to the AC-NEW-4 / AC-NEW-7 Monte-Carlo corpus for the next CI run. +4. **Forensic thumbnail review** (AC-8.5 exception): failed-tile thumbnails are visible in the operator UI for human review; this is the only image-data review surface. + +## 3. CI / dev environments (Tier-1 / Tier-2) + +Tier-1 dev / staging containers DO expose conventional observability surfaces, because they're being driven by humans and CI orchestrators that need them. The airborne profile of § 1 is the **production-only** profile. 
+ +### 3.1 Logging (Tier-1 / Tier-2) + +Structured JSON to stdout/stderr (consumed by the developer's `docker compose logs` or by CI's log collector): + +```json +{ + "timestamp": "2026-05-09T08:42:11.234Z", + "level": "INFO", + "service": "gps-denied-companion", + "component": "C5", + "flight_id": "", + "monotonic_ms": 12345, + "message": "Source label transition", + "context": { + "from": "satellite_anchored", + "to": "visual_propagated", + "reason": "vpr_no_match" + } +} +``` + +Log levels: + +| Level | Usage | Example | +|-------|-------|---------| +| ERROR | Exceptions; component fault that triggered AC-5.2 fallback | "VIO strategy initialization failed: GTSAM dlopen failed" | +| WARN | Degraded behavior; FDR segment drop; thermal-throttle hybrid switch | "Thermal throttle active; downgrading K=3 → K=2" | +| INFO | Significant lifecycle events; source label transition | "Source label: satellite_anchored → visual_propagated" | +| DEBUG | Per-frame diagnostic — Tier-1 / dev only; production refuses this level (environment_strategy.md § Variable validation) | "MatchResult: 47 inliers, residual=2.3px" | + +**PII / safety-sensitive content**: no GPS coordinates in DEBUG / INFO logs by default. Only `horiz_accuracy` (a scalar) is INFO-loggable; the actual lat/lon is FDR-only. WARN / ERROR log records may include lat/lon when the operator's troubleshooting requires it; in that case the FDR still has the canonical record. 
+ +Log retention: + +| Environment | Destination | Retention | +|-------------|-------------|-----------| +| `dev-tier1` | Docker stdout | Container lifetime | +| `dev-tier2` | journald (Jetson) | 7 days | +| `staging-tier1` (CI) | GitHub Actions log artifact | 30 days (matches CI artifact retention) | +| `staging-tier2` (Jetson CI) | Self-hosted runner journald + uploaded report | 30 days | +| `production` | journald (Jetson) | 7 days, see § 1.3 | + +### 3.2 Metrics (Tier-1 / Tier-2) + +Prometheus-compatible `/metrics` endpoint on `dev-tier1`, `staging-tier1`, `staging-tier2`. **Disabled on `production`** (no listener on the airborne companion, NFT-SEC-05). + +Application metrics: + +| Metric | Type | Description | +|--------|------|-------------| +| `gps_denied_frame_processed_total` | Counter | Total nav frames processed (per `GPS_DENIED_VIO_STRATEGY` label) | +| `gps_denied_frame_emit_latency_seconds` | Histogram | End-to-end frame → emit latency (the AC-4.1 metric) | +| `gps_denied_source_label_total` | Counter | Emitted positions counted per source label (`satellite_anchored`, `visual_propagated`, `dead_reckoned`) | +| `gps_denied_vpr_match_rate` | Gauge | Rolling-1-minute rate of successful VPR matches | +| `gps_denied_thermal_hybrid_active` | Gauge | 0/1 — is the K=2 thermal-throttle hybrid active? (D-CROSS-LATENCY-1) | +| `gps_denied_fdr_segment_drops_total` | Counter | Total FDR segment drops this run (AC-NEW-3 audit) | +| `gps_denied_fdr_size_bytes` | Gauge | Current FDR ring size in bytes (must stay ≤ 64 GB) | +| `gps_denied_signing_key_rotations_total` | Counter | MAVLink signing key rotation count | + +System metrics: standard `process_*`, `python_*` exporters; on Tier-2 also `jetson_stats_*` exposed via `jtop` exporter. 
+ +Business metrics (i.e., AC-derived): + +| Metric | AC | Use | +|--------|------|-------------| +| `gps_denied_horiz_accuracy_m` (gauge, last value) | AC-NEW-4 | Live operator dashboard on operator workstation post-flight; CI threshold checks | +| `gps_denied_cold_start_seconds` | AC-NEW-1 | Set once at takeoff load completion; NFT-PERF-03 reads it | +| `gps_denied_spoofing_promotion_latency_seconds` | AC-NEW-2 | Set on each promotion / rejection event; NFT-PERF-04 reads it | + +Collection interval: 15 s (typical Prometheus default; Tier-2 NFT runs may use 1 s for AC-bound timing). + +### 3.3 Distributed tracing — NOT applicable + +The runtime is a single in-process Python program with no cross-service hops in flight (architecture.md § 5 internal communication is all in-process). Distributed tracing is therefore not applicable to the production runtime. + +The Tier-1 integration setup DOES involve cross-container hops (companion ↔ mock-sat ↔ db ↔ e2e-runner), but those are exercised by the e2e test framework's own log + status capture; OpenTelemetry is not provisioned for this project. If a future cycle introduces a multi-process companion (which ADR-004 explicitly rejected for the airborne profile but might appear on the operator workstation for C11 Tile Manager + C12 Operator Pre-flight Tooling), tracing can be reconsidered then. + +## 4. Alerting (post-flight, not in-flight) + +There is no live in-flight alerting from the airborne companion. The operator's **GCS** is the live human-loop interface (STATUSTEXT severity stream § 1.2). 
All other alerting is **post-flight**: + +| Source | Severity | Response Time | Conditions | +|----------|---------------|-----------|----------| +| FDR review (operator workstation) | Critical | Same-day human review | FDR segment drop count > 0; component fail event; spoofing-promotion latency > 3 s; AC-NEW-4 outliers (P(err > 1 km) > 0.01 % in this flight's window) | +| FDR review | High | Next-day | AC-NEW-1 cold-start TTFF > 30 s p95 in this flight's window; thermal-throttle hybrid active > 25 % of the flight | +| FDR review | Medium | Within 1 week | Mid-flight tile failure rate > 5 %; high VPR no-match rate; sustained `dead_reckoned` periods > 10 s | +| CI (Tier-2) | Critical | Block PR merge | Any AC-bound NFT failure (architecture.md § 6 NFR list) | +| CI (Tier-1) | Critical | Block PR merge | Build failure; security CVE; SBOM diff fail (ADR-002) | + +Notification channels: + +| Severity | Channel | +|----------|---------| +| Critical (FDR or CI) | Slack `#gps-denied-ops` + email | +| High | Slack `#gps-denied-ops` | +| Medium | Slack `#gps-denied-ops` (digest) | + +There is no PagerDuty / on-call rotation for this project; in-flight failures are handled by the FC's IMU-only fallback (AC-5.2), not by an operations team. + +## 5. Dashboards + +### 5.1 Operator workstation post-flight dashboard + +Built into operator-tooling C12. Per flight: + +- Time series: source label, `horiz_accuracy`, `last_anchor_age_ms`, CPU%, GPU%, temp. +- Event markers: VISUAL_BLACKOUT entries, spoofing events, signing key rotations, thermal hybrid switches. +- Map: emitted track + FC ground truth (when available) + pre-flight cache footprint + mid-flight tile coverage. +- Statistics: per-flight error CDF; AC-NEW-4 contribution; mid-flight tile counts. +- FDR audit table: any `0x000F` lifecycle events of severity ≥ WARN. + +### 5.2 CI dashboard (Tier-2) + +GitHub Actions job summary plus a per-NFT report uploaded as workflow artifact. 
The summary includes: + +- Pass / fail per NFT scenario. +- For NFT-PERF-*: histogram of latencies + comparison to threshold. +- For NFT-LIM-*: peak memory / FDR size traces. +- For NFT-RES-*: AC-NEW-4 / AC-NEW-7 statistical summary with stated 95 % CI. +- For IT-12: comparative-study summary across all VIO / VPR strategies in the research binary. + +There is no live CI dashboard separate from the GitHub Actions UI; the project is small enough that the per-PR job summary is sufficient. + +### 5.3 No live in-flight dashboard + +Out of scope by design. The GCS is the only live operator surface; all other inspection is post-flight. + +## 6. Open Items / Plan-Phase Carryforward + +- **Long-term FDR archive** (multi-flight statistical headroom): D-PROJ-3 (multi-flight fixture acquisition for AC-NEW-4 / AC-NEW-7) is not pursued this cycle. If pursued in a future cycle, post-flight FDR archives become a corpus contribution path; the operator-tooling FDR-retrieval step would need an explicit "contribute to corpus" toggle. +- **Telemetry-link encryption** beyond MAVLink-2.0 signing: out of scope; addressed by physical link assumptions in the threat model (architecture.md § 7). +- **iNav signing**: still has no equivalent to MAVLink-2.0 signing (Mode B Source #129). Carryforward Plan-phase action: file a feature request upstream; meanwhile observability for iNav-profile flights is the same as AP-profile minus the `MavlinkSigningKeyRotated` records (which are NULL on iNav flights per data_model.md § 2.2). 
diff --git a/_docs/02_document/diagrams/components.drawio b/_docs/02_document/diagrams/components.drawio new file mode 100644 index 0000000..b448522 --- /dev/null +++ b/_docs/02_document/diagrams/components.drawio @@ -0,0 +1,221 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/_docs/02_document/diagrams/flows/00_index.md b/_docs/02_document/diagrams/flows/00_index.md new file mode 100644 index 0000000..74ff9a3 --- /dev/null +++ b/_docs/02_document/diagrams/flows/00_index.md @@ -0,0 +1,34 @@ +# System Flow Diagrams — Index + +The canonical Mermaid sequence diagrams + flowcharts for every system flow live in the single source of truth `_docs/02_document/system-flows.md`. This index file exists so the `diagrams/` directory has a discoverable per-flow entry point matching the artifact-management convention; **do not duplicate Mermaid bodies here** — keep them in `system-flows.md` so they cannot drift. 
+ +## Per-flow links + +| Flow | Description | Section in `system-flows.md` | +|------|-------------|------------------------------| +| F1 | Pre-flight cache provisioning | `## Flow F1: Pre-flight cache provisioning` | +| F2 | Takeoff load (cold-start TTFF, AC-NEW-1) | `## Flow F2: Takeoff load` | +| F3 | Steady-state per-frame estimation (the F3 hot path) | `## Flow F3: Steady-state per-frame estimation` | +| F4 | Mid-flight tile generation + local cache write | `## Flow F4: Mid-flight tile generation + local cache write` | +| F5 | Visual blackout + spoofed-GPS failsafe (AC-NEW-8) | `## Flow F5: Visual blackout + spoofed-GPS failsafe` | +| F6 | Sharp-turn / disconnected-segment re-localization | `## Flow F6: Sharp-turn / disconnected-segment re-localization` | +| F7 | Spoofing-promotion via EKF source-set switch (AC-NEW-2) | `## Flow F7: Spoofing-promotion via EKF source-set switch` | +| F8 | Companion reboot recovery | `## Flow F8: Companion reboot recovery` | +| F9 | GCS telemetry stream (AC-6.1) | `## Flow F9: GCS telemetry stream` | +| F10 | Post-landing tile upload (D-PROJ-2 contract) | `## Flow F10: Post-landing tile upload` | + +Each section in `system-flows.md` contains: description, preconditions, Mermaid sequence diagram, Mermaid flowchart, data-flow table, error scenarios, and performance expectations. + +## Component-level diagram + +The component-level diagram (showing component boundaries and inter-component edges) is `_docs/02_document/diagrams/components.drawio`. It is the visual companion to the per-component specs under `_docs/02_document/components/`. + +## Conventions reminder + +Per `.cursor/skills/plan/templates/system-flows.md` § Mermaid Diagram Conventions: +- Component-named participants (e.g. `participant C5 as C5 StateEstimator`). +- camelCase node IDs. +- `{Question?}` decisions, `([label])` start/end, `[[label]]` for external systems. +- No inline styling. 
+ +If a new flow Fn is added, insert it in `system-flows.md` AND add a row above. Do NOT create separate flow files unless the project later decides to split `system-flows.md` per flow (in which case this index becomes the table of contents and the Mermaid bodies move). diff --git a/_docs/02_document/epics.md b/_docs/02_document/epics.md new file mode 100644 index 0000000..66bac13 --- /dev/null +++ b/_docs/02_document/epics.md @@ -0,0 +1,2054 @@ +# Work-Item Epics — gps-denied-onboard Plan cycle 1 + +This file is the local epic draft for Plan Step 6. Tracker IDs (`AZ-XXX`) are populated when each epic is created in Jira (project `AZ`). Until then, every entry carries `Tracker: pending`. + +## Conventions + +- **Issue type**: Epic. +- **Epic descriptions are self-contained** per the plan-skill rule: a developer reading only the epic should understand the full context. Each epic has the 14 required sections (system context, problem, scope, architecture notes, interface spec, data flow, dependencies, AC, NFRs, risks, effort, child issues, key constraints, testing strategy). +- **Effort sizing**: T-shirt size + story-points range for the epic; per-task story points (PBI complexity) follow the user rule: values come from the 1, 2, 3, 5, 8 scale, but no PBI may exceed 5 points — aim for 2–3 points, occasionally 5. +- **Cross-cutting epics** parent exactly one shared implementation task; component epics consuming the concern declare a dependency, never re-implement locally. +- **Dependency rule**: no epic depends on a later one in this index. The one exception is the co-developed E-C4 / E-C5 pair, which share the GTSAM substrate and therefore list each other as dependencies. 
+ +## Index + +| # | Epic ID | Title | Type | Tracker | T-shirt | Story Pts | Depends on | +|---|---------|-------|------|---------|---------|-----------|------------| +| 1 | E-BOOT | Bootstrap & Initial Structure | bootstrap | Tracker: pending | M | 13–21 | — | +| 2 | E-CC-LOG | Cross-Cutting: Structured JSON Logging | cross-cutting | Tracker: pending | S | 5–8 | E-BOOT | +| 3 | E-CC-CONF | Cross-Cutting: Configuration & Composition Root | cross-cutting | Tracker: pending | S | 5–8 | E-BOOT | +| 4 | E-CC-FDR-CLIENT | Cross-Cutting: FDR Producer Client (lock-free queue + record schema) | cross-cutting | Tracker: pending | M | 8–13 | E-BOOT, E-CC-LOG | +| 5 | E-C13 | C13 Flight Data Recorder (writer thread + segments + cap) | component | Tracker: pending | L | 21–34 | E-BOOT, E-CC-LOG, E-CC-CONF, E-CC-FDR-CLIENT | +| 6 | E-C7 | C7 On-Jetson Inference Runtime | component | Tracker: pending | L | 21–34 | E-BOOT, E-CC-CONF, E-CC-FDR-CLIENT | +| 7 | E-C6 | C6 Tile Cache + Spatial Index | component | Tracker: pending | M | 13–21 | E-BOOT, E-CC-LOG, E-CC-CONF | +| 8 | E-C11 | C11 Tile Manager (TileDownloader + TileUploader) | component | Tracker: pending | M | 13–21 | E-C6, E-CC-CONF, E-CC-LOG | +| 9 | E-C10 | C10 Pre-flight Cache Provisioning | component | Tracker: pending | M | 13–21 | E-C6, E-C7, E-CC-LOG | +| 10 | E-C12 | C12 Operator Pre-flight Tooling | component | Tracker: pending | M | 13–21 | E-C10, E-C11, E-CC-LOG | +| 11 | E-C1 | C1 Visual / Visual-Inertial Odometry | component | Tracker: pending | XL | 34–55 | E-BOOT, E-CC-FDR-CLIENT, E-C7 | +| 12 | E-C2 | C2 Visual Place Recognition | component | Tracker: pending | L | 21–34 | E-C6, E-C7, E-CC-FDR-CLIENT | +| 13 | E-C2.5 | C2.5 Inlier-based Re-rank | component | Tracker: pending | S | 5–8 | E-C2, E-C7, E-C6 (LightGlue helper shared with C3) | +| 14 | E-C3 | C3 Cross-Domain Matcher | component | Tracker: pending | L | 21–34 | E-C2.5, E-C7 | +| 15 | E-C3.5 | C3.5 AdHoP-Conditional Refinement | component | 
Tracker: pending | M | 8–13 | E-C3, E-C7 | +| 16 | E-C4 | C4 Pose Estimator | component | Tracker: pending | M | 13–21 | E-C3.5, E-C5 (shared GTSAM substrate; co-developed) | +| 17 | E-C5 | C5 State Estimator | component | Tracker: pending | XL | 34–55 | E-C1, E-C4 (shared graph), E-CC-FDR-CLIENT | +| 18 | E-C8 | C8 FC + GCS Adapter | component | Tracker: pending | L | 21–34 | E-C5, E-CC-CONF, E-CC-LOG | +| 19 | E-BBT | Blackbox Tests (FT/NFT scenarios) | tests | Tracker: pending | M | 13–21 | every component epic ships its component-internal tests under its own epic; this one parents the suite-level FT/NFT scenarios in `_docs/02_document/tests/*.md` | + +## High-level component dependency diagram + +```mermaid +flowchart TB + BOOT[E-BOOT Bootstrap] + LOG[E-CC-LOG Logging] + CONF[E-CC-CONF Config + Composition Root] + FDRC[E-CC-FDR-CLIENT FDR Producer Client] + C13[E-C13 FDR] + C7[E-C7 Inference Runtime] + C6[E-C6 Tile Cache] + C11[E-C11 Tile Manager] + C10[E-C10 Cache Provisioning] + C12[E-C12 Operator Tooling] + C1[E-C1 VIO] + C2[E-C2 VPR] + C25[E-C2.5 Re-rank] + C3[E-C3 Matcher] + C35[E-C3.5 AdHoP] + C4[E-C4 Pose] + C5[E-C5 State] + C8[E-C8 FC Adapter] + BBT[E-BBT Blackbox Tests] + + BOOT --> LOG --> FDRC --> C13 + BOOT --> CONF --> C13 + BOOT --> CONF --> C7 + C13 -.-> C7 + CONF --> C6 --> C11 + C6 --> C10 + C7 --> C10 + C10 --> C12 + C11 --> C12 + C7 --> C2 --> C25 --> C3 --> C35 --> C4 + C6 --> C2 + C6 --> C25 + C1 --> C5 + C4 <--> C5 + C5 --> C8 + FDRC --> C1 + FDRC --> C5 + C8 --> BBT + C12 --> BBT +``` + +--- + +## E-BOOT — Bootstrap & Initial Structure + +**Tracker**: pending +**Type**: bootstrap +**T-shirt**: M | **Story points**: 13–21 +**Owner**: onboard team + +### System context + +```mermaid +flowchart LR + EBOOT[E-BOOT scaffolding] --> SRC[src/ component dirs] + EBOOT --> CICD[CI Tier-1 + Tier-2 jobs] + EBOOT --> DOCKER[docker-compose.test.yml] + EBOOT --> DB[Postgres init scripts] + EBOOT --> TESTROOT[tests/ + tests/fixtures/] +``` + +### Problem 
/ Context + +No source layout exists yet. Every downstream epic assumes a defined repo skeleton: `src/components/_/`, `src/shared//`, `tests/`, `tests/fixtures/`, plus the Tier-1 Docker compose, the Tier-2 CI job, the Postgres init scripts that match `data_model.md`, and the operator-tooling tarball build path. Until this exists, no other epic can start. + +### Scope + +**In scope**: +- Create `src/components/_/` for all 14 components with empty package init. +- Create `src/shared/{logging,config,fdr_client,crypto,calibration_loader}/` placeholders. +- `pyproject.toml` (Python) + `CMakeLists.txt` (C++ where used by C1) with the project's pinned dep set. +- Tier-1 `docker-compose.test.yml` skeleton (companion + Postgres + e2e-runner; mock-suite-sat-service compose pulled in only by upload tests). +- Tier-2 CI job that runs on the bench Jetson runner, with the JetPack 6.2 / TRT 10.3 / SM 87 image pinned per ADR-005. +- Postgres init scripts for the schema in `data_model.md`. +- `tests/` directory with `tests/fixtures/`, `tests/tmp/`, `tests/conftest.py`. +- Empty `runtime_root.py` for the airborne composition root + `operator_tool/__main__.py` for the operator side. +- `.gitignore` covering binaries, engine caches, FDR segments, ephemeral keys. +- README with run commands. + +**Out of scope**: +- Any per-component logic (each component's epic owns its own implementation). +- Cross-cutting impl (logging / config / FDR client live in their own epics). + +### Architecture notes + +- ADR-005 (Tier-1 / Tier-2 are first-class) drives the CI split. +- ADR-009 (composition root) places `runtime_root.py` at the airborne entrypoint and `operator_tool/__main__.py` at the operator side. +- ADR-002 (build-time exclusion) requires per-implementation CMake `BUILD_*` flags and the SBOM diff to be wired in CI from day one. +- ADR-004 (process isolation) requires the airborne build target to refuse `c11_tilemanager/` symbols. SBOM diff hook lives here from Bootstrap onward. 
+ +### Interface specification + +This epic exposes no runtime interface; it ships repository scaffolding only. + +### Data flow + +N/A. + +### Dependencies + +- Epic dependencies: none. +- External: GitHub Actions runner pool (Tier-1 Docker), bench Jetson runner (Tier-2), pinned base images (JetPack 6.2, Postgres 16, mcr.microsoft.com/dotnet/aspnet:8.0-alpine for the test fixture). + +### Acceptance criteria + +- `docker compose -f docker-compose.test.yml up -d` brings up companion + Postgres + e2e-runner cleanly on a fresh workstation. +- Tier-2 CI smoke-job (`echo $JETPACK_VERSION` + `nvidia-smi`) passes on the bench Jetson. +- `pytest tests/ -q --collect-only` discovers the empty `tests/` tree without errors. +- The SBOM diff CI step exists and fails the build if `c11_tilemanager` ever appears in the airborne `production-binary` artifact (R02 enforcement seed). +- `runtime_root.py` runs and exits cleanly with a "no components configured" message (proves composition root wiring). + +### Non-functional requirements + +- CI cold-build wall-clock ≤ 10 min on Tier-1; ≤ 6 min on Tier-2 (just the smoke-job). +- Repo size at this stage ≤ 5 MB (no fixtures committed). + +### Risks & mitigations + +- **R12** (single deployment camera) — Bootstrap's CI must not assume the unit is plugged in; Tier-2 smoke-job runs without the camera, only against TRT/SM/JP version. + +### Effort + +T-shirt M; 13–21 story points across child PBIs (each ≤ 5 points). 
+ +### Child issues (PBIs) + +| # | Title | Pts | +|---|-------|-----| +| 1 | Repo scaffolding: `src/components/`, `src/shared/`, `tests/`, `runtime_root.py` | 2 | +| 2 | `pyproject.toml` + `CMakeLists.txt` with pinned deps | 3 | +| 3 | Tier-1 `docker-compose.test.yml` skeleton + Postgres init | 3 | +| 4 | Tier-2 CI smoke-job on bench Jetson | 3 | +| 5 | SBOM diff CI step (R02 enforcement seed; fails on `c11_tilemanager` in airborne artifact) | 3 | +| 6 | `.gitignore` + `README.md` + run commands | 2 | +| 7 | `runtime_root.py` minimum (compose root + "no components configured" exit path) | 2 | + +### Key constraints + +- RESTRICT-HW-1 (Jetson Orin Nano Super, 8 GB shared LPDDR5, 25 W) — Tier-2 image pins SM 87 / JP 6.2 / TRT 10.3. +- RESTRICT-FC-1 (AP + iNav supported; PX4 out of scope) — composition root wires only AP + iNav adapters. + +### Testing strategy + +- CI smoke tests on every PR (Tier-1 compose-up, Tier-2 nvidia-smi). +- No unit tests yet — those live in component epics. + +--- + +## E-CC-LOG — Cross-Cutting: Structured JSON Logging + +**Tracker**: pending +**Type**: cross-cutting +**T-shirt**: S | **Story points**: 5–8 + +### System context + +Every component's `§ 9 Logging Strategy` mandates structured JSON logging at ERROR / WARN / INFO / DEBUG levels with per-frame fields (`frame_id`, `kind`, component-specific keys). A single shared logger module under `src/shared/logging/` produces these records; every component imports it. + +```mermaid +flowchart LR + COMP[Any component] --> LOGGER[src/shared/logging
structured JSON] + LOGGER --> STDOUT[stdout / journald] + LOGGER --> FDR["FDR (via E-CC-FDR-CLIENT for ERROR + WARN)"] +``` + +### Problem / Context + +If every component rolls its own logger, format drift is guaranteed. The `traceability-matrix` and post-flight FDR analysis rely on a stable JSON schema; a shared logger is the only honest way. + +### Scope + +**In scope**: +- `src/shared/logging/__init__.py` exporting `get_logger(component_id: str) -> Logger`. +- JSON formatter with stable field ordering (`ts, level, component, frame_id, kind, msg, ...kv`). +- Drop-in `RotatingStdoutHandler` for Tier-1 dev; `JournaldHandler` for Tier-2 production. +- Bridge into the FDR client for ERROR + WARN levels (handler subscribes to log records and enqueues a `kind = "log"` FdrRecord). +- Helpers for the documented per-frame log shapes (`vio.frame_id`, `vpr.top10_distances`, etc.) so component code is short. + +**Out of scope**: per-component log content (lives in each component epic's child PBIs). + +### Architecture notes + +Stdlib `logging` + `python-json-logger` (or `orjson` formatter for speed). No new dependency beyond what's already in `pyproject.toml`. No third-party log aggregator — Tier-1 uses Docker stdout capture; Tier-2 uses journald. + +### Interface specification + +```python +def get_logger(component_id: str) -> logging.Logger: ... + +class StructuredJsonHandler(logging.Handler): + """JSON formatter + FDR bridge for ERROR/WARN.""" + +class FdrLogBridge: + """Subscribed by the logger; forwards ERROR + WARN to E-CC-FDR-CLIENT.enqueue.""" +``` + +### Data flow + +```mermaid +sequenceDiagram + participant C as Component + participant L as Logger + participant S as stdout + participant F as FDR Client + C->>L: log.warn("VPR top-1 above threshold", distance=0.42) + L->>S: {"level":"WARN", "component":"c2", ...} + L->>F: enqueue(kind="log", level="WARN", payload=...) +``` + +### Dependencies + +- Depends on E-BOOT. 
+- External: `python-json-logger` or `orjson` (whichever is already pinned). + +### Acceptance criteria + +- Every component test that asserts a log message uses the shared logger and finds the expected JSON shape. +- ERROR + WARN records appear in FDR with `kind = "log"` and a back-reference to the originating component. +- INFO + DEBUG do NOT appear in FDR (per-component § 9 storage rule). +- Log format passes a contract test (`tests/contract/log_schema.py`) verifying field names + ordering + required keys. + +### Non-functional requirements + +- Per-record latency p99 ≤ 0.2 ms (lock-free emit on the hot path). +- No allocation in the steady-state DEBUG path beyond the message string itself. + +### Risks & mitigations + +- **R13** (FDR queue overrun) — the FDR bridge uses E-CC-FDR-CLIENT's drop-oldest semantics; it never blocks the caller. + +### Effort + +T-shirt S; 5–8 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `src/shared/logging/` module + JSON formatter + handlers | 3 | +| 2 | FDR log bridge (ERROR + WARN → kind=log) | 2 | +| 3 | Contract test `tests/contract/log_schema.py` | 2 | + +### Key constraints + +- AC-NEW-3 (FDR ≤ 64 GB / flight, no silent drops) — DEBUG must not flow into FDR; verified by the contract test. + +### Testing strategy + +- Unit tests for the formatter (field ordering + escaping). +- Contract test against the FDR record schema (kind=log). +- Integration via every component's tests.md (each component asserts at least one log message). + +--- + +## E-CC-CONF — Cross-Cutting: Configuration & Composition Root + +**Tracker**: pending +**Type**: cross-cutting +**T-shirt**: S | **Story points**: 5–8 + +### System context + +ADR-001 (runtime selection by config) + ADR-009 (composition root) together require a single shared loader that materialises the `Config` object at process startup, plus a `compose_root(config)` function that constructs each strategy/component instance with its dependencies. 
No component instantiates another component itself. + +```mermaid +flowchart LR + ENV[ENV vars] --> LOADER + YAML[config.yaml] --> LOADER + CALIB[Camera calibration JSON] --> LOADER + LOADER[src/shared/config/loader] --> ROOT[runtime_root.py / operator_tool/__main__.py] + ROOT --> COMPS[component instances] +``` + +### Problem / Context + +Without a single source of truth for configuration, the BUILD_* + runtime-strategy-selection rules of ADR-001/002/009 collapse — components silently fall back to defaults, and the composition root grows local config-parsing logic that drifts. The CI gate that ensures only the *linked* strategies are selectable also lives here. + +### Scope + +**In scope**: +- `src/shared/config/loader.py`: env + YAML + camera-calibration JSON merging with explicit precedence (env > YAML > defaults). +- `Config` dataclass (frozen) covering every component's startup knob. +- `compose_root(config) -> RuntimeRoot` for the airborne process; `compose_operator(config) -> OperatorRoot` for the tooling side. +- Strategy-vs-build-flag consistency check at startup: refuse to start if config selects a strategy whose `BUILD_*` flag was off in the linked binary. + +**Out of scope**: any component's specific config shape (defined inside its own epic). + +### Architecture notes + +- ADR-001, ADR-002, ADR-009 all converge here. +- The composition root is the only place `import` of a concrete `VioStrategy` / `VprStrategy` / etc. is allowed; component code imports the abstract interface only. + +### Interface specification + +```python +@dataclass(frozen=True) +class Config: ... # populated by union of every component's config schema + +def load_config(env: dict[str, str], paths: list[Path]) -> Config: ... + +def compose_root(config: Config) -> RuntimeRoot: ... +def compose_operator(config: Config) -> OperatorRoot: ... +``` + +### Data flow + +Startup-only — runs once per process. No per-frame path. + +### Dependencies + +- Depends on E-BOOT. 
+ +### Acceptance criteria + +- `compose_root` constructs a runnable airborne process for every documented config preset (default deployment, IT-12 research-binary, smoke-test minimal). +- Strategy/build-flag mismatch triggers an explicit `StrategyNotLinkedError` with a clear message (no silent fallback). +- Config precedence (env > YAML > defaults) verified by unit tests for at least 3 keys per layer. +- `runtime_root.py` exits with code 0 when given a valid config and no components actually do work (reachability proof). + +### Non-functional requirements + +- Cold-start config load + compose ≤ 1 s on Tier-2 (counts toward AC-NEW-1's 30 s budget). + +### Risks & mitigations + +- **R02** (ADR-004 process isolation) — compose_root's strategy/build-flag check is the third enforcement gate (after SBOM diff and runtime self-check) preventing C11 from running airborne. + +### Effort + +T-shirt S; 5–8 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `src/shared/config/loader.py` + `Config` dataclass | 3 | +| 2 | `compose_root` + `compose_operator` skeletons + StrategyNotLinkedError | 3 | +| 3 | Unit tests for env/YAML/defaults precedence | 2 | + +### Key constraints + +- ADR-002 (build-time exclusion) — only linked strategies selectable. + +### Testing strategy + +- Unit: precedence + `StrategyNotLinkedError`. +- Integration: every documented preset starts cleanly. + +--- + +## E-CC-FDR-CLIENT — Cross-Cutting: FDR Producer Client + +**Tracker**: pending +**Type**: cross-cutting +**T-shirt**: M | **Story points**: 8–13 + +### System context + +C13 owns the FDR writer thread, segment files, and the 64 GB cap. Every other component publishes via a producer-side client: lock-free enqueue + an `FdrRecord` schema versioned in `RecordSchema`. This epic owns ONLY the producer side; the writer-thread internals belong to E-C13. 
+ +```mermaid +flowchart LR + PROD[Component producer] --> Q[lock-free ring buffer] + Q --> WRITER[E-C13 writer thread] + WRITER --> SEG[segment file on NVM] +``` + +### Problem / Context + +Producer-side correctness (drop-oldest with rollover-log, schema versioning, never-block) is independent of where the file lands. Co-locating producer logic inside E-C13 would force every component test to spin up the writer thread; a thin shared client lets component tests use a fake sink. + +### Scope + +**In scope**: +- `src/shared/fdr_client/__init__.py` exporting `FdrClient(producer_id: str) -> Client`. +- Lock-free SPSC ring buffer per producer; capacity configurable (default per producer in `Config`). +- `FdrRecord` versioned schema (orjson or msgpack — pinned in E-BOOT). +- Drop-oldest behaviour writing a structured `kind=overrun` record with `producer_id` + dropped count (never silent). +- `FakeFdrSink` for component-level tests. + +**Out of scope**: writer thread, segment files, 64 GB cap, rollover policy (E-C13). + +### Architecture notes + +- AC-NEW-3 (no silent drops) is enforced HERE: drop-oldest always emits the overrun record. +- Schema versioning prevents post-flight tooling breakage when payload classes evolve. + +### Interface specification + +```python +class FdrClient: + def __init__(self, producer_id: str): ... + def enqueue(self, record: FdrRecord) -> None: ... # lock-free, never blocks + def flush(self) -> None: ... # used by tests only +``` + +### Data flow + +```mermaid +sequenceDiagram + participant C as Component + participant Q as Ring buffer + participant W as Writer (E-C13) + C->>Q: enqueue(record) + alt overrun + Q->>Q: drop oldest + emit kind=overrun record + end + W->>Q: dequeue (in writer thread) +``` + +### Dependencies + +- Depends on E-BOOT, E-CC-LOG. +- Consumed by every component that emits FDR records. + +### Acceptance criteria + +- `enqueue` never blocks even under writer-thread stall (verified by C13-IT-05 from the C13 tests.md). 
+- Every overrun event produces a structured record with non-zero `dropped_count` and the originating `producer_id`. +- Schema version bump (e.g., adding a new field) does not break post-flight tooling that reads at version N-1 (forward-compatible parser). + +### Non-functional requirements + +- `enqueue` p99 ≤ 5 µs on Tier-2 (no allocation on the steady-state path; pre-sized buffers). +- Per-producer ring buffer size ≤ configured cap (no unbounded growth). + +### Risks & mitigations + +- **R13** (queue overrun) — the design IS the mitigation: drop-oldest + always log. + +### Effort + +T-shirt M; 8–13 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | Lock-free SPSC ring buffer per producer | 5 | +| 2 | `FdrRecord` schema + versioned serialiser (orjson/msgpack) | 3 | +| 3 | Drop-oldest + `kind=overrun` record emission | 2 | +| 4 | `FakeFdrSink` for component tests | 2 | + +### Key constraints + +- AC-NEW-3 (no silent drops). + +### Testing strategy + +- Unit: ring buffer correctness under contention; overrun record emitted. +- Property tests: forward-compat parser at version N-1. + +--- + +## E-C13 — C13 Flight Data Recorder + +**Tracker**: pending | **Type**: component | **T-shirt**: L | **Story points**: 21–34 + +### System context + +```mermaid +flowchart LR + ALL[All components] -->|enqueue via E-CC-FDR-CLIENT| Q[per-producer queues] + Q --> W[C13 writer thread] + W --> SEGS[segmented files on NVM] + SEGS -.->|post-landing| OPTOOL[E-C12 retrieval] +``` + +### Problem / Context + +Per-flight ≤ 64 GB record of every payload class onboard, no silent drops, raw frames excluded except the ≤ 0.1 Hz failed-tile thumbnail forensic exception (AC-NEW-3, AC-8.5). Single writer thread; every other component produces. 
+ +### Scope + +**In scope**: writer thread, segment file lifecycle, 64 GB cap with oldest-segment-dropped policy, per-flight `FlightHeader` + `FlightFooter`, atomic segment rotation, mid-flight tile snapshot path, failed-tile thumbnail rate cap, refusal of takeoff when `open_flight` fails. + +**Out of scope**: producer-side enqueue (E-CC-FDR-CLIENT); post-flight retrieval UI (E-C12). + +### Architecture notes + +- File: `_docs/02_document/components/14_c13_fdr/description.md` is the canonical spec. +- `IncrementalFixedLagSmoother` from C5 publishes smoothed past-keyframes via FDR ONLY (AC-4.5 revised) — NOT into the FC stream. +- Segment rotation uses `atomicwrites`; cross-process safety on the FDR root via `filelock`. + +### Interface specification + +```python +class FdrWriter: + def open_flight(header: FlightHeader) -> None: ... # raises FdrOpenError + def write_record(record: FdrRecord) -> None: ... # lock-free; FdrQueueOverrunError logged not raised + def close_flight() -> FlightFooter: ... + def current_size_bytes() -> int: ... + def is_rolling() -> bool: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant Prods as Producers (every component) + participant Q as Per-producer queues + participant W as Writer thread + participant FS as NVM segment file + Prods->>Q: enqueue(record) + W->>Q: dequeue + W->>FS: serialise + append + alt segment >= cap + W->>FS: atomic rotate, drop oldest if total > 64 GB, emit kind=segment_rollover + end +``` + +### Dependencies + +- E-BOOT, E-CC-LOG, E-CC-CONF, E-CC-FDR-CLIENT. + +### Acceptance criteria + +- AC-NEW-3: synthetic 8 h replay produces ≤ 64 GB on disk, with every drop accompanied by a `kind=overrun` and/or `kind=segment_rollover` record. +- AC-8.5: `kind=raw_nav_frame` writes raise `RawFrameWriteForbiddenError`; `kind=failed_tile_thumbnail` rate-limited to ≤ 0.1 Hz. +- AC-1.4 / AC-4.5: every smoothed past-keyframe revision lands in FDR; the FC emission stream is unchanged. 
+- AC-NEW-3 takeoff gate: `FdrOpenError` aborts takeoff before the FC adapter is opened. + +### Non-functional requirements + +- Writer throughput ≥ 200 Hz aggregate (per C13-PT-01). +- Per-record serialise + write p95 ≤ 5 ms. + +### Risks & mitigations + +- **R13** (queue overrun) — drop-oldest + always-log. +- **R02** (ADR-004) — C13 runs in the airborne process; no cross-process FDR root contention with C11 (C11 not airborne). + +### Effort + +T-shirt L; 21–34 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | Writer thread + segment file open/close/rotate | 5 | +| 2 | `FlightHeader` / `FlightFooter` + records-written/dropped accounting | 3 | +| 3 | 64 GB cap + oldest-segment-dropped policy + `kind=segment_rollover` record | 5 | +| 4 | Mid-flight tile snapshot path + filesystem layout | 3 | +| 5 | Failed-tile thumbnail ≤ 0.1 Hz rate limiter + AC-8.5 enforcement | 3 | +| 6 | `FdrOpenError` takeoff abort path | 2 | +| 7 | Component-internal tests C13-IT-01..06 + C13-PT-01 + C13-ST-01 | 5 | + +### Key constraints + +- AC-NEW-3, AC-8.5, AC-4.5 (revised), RESTRICT-UAV-4. + +### Testing strategy + +- Per `_docs/02_document/components/14_c13_fdr/tests.md` — six component-internal tests + 8 h NFT-LIM-02 at the suite level. + +--- + +## E-C7 — C7 On-Jetson Inference Runtime + +**Tracker**: pending | **Type**: component | **T-shirt**: L | **Story points**: 21–34 + +### System context + +```mermaid +flowchart LR + C2[C2 VPR] --> C7 + C25[C2.5 Re-rank] --> C7 + C3[C3 Matcher] --> C7 + C35[C3.5 AdHoP] --> C7 + C7 --> TRT[TensorRT] + C7 --> ORT[ONNX Runtime] + C7 --> PT[PyTorch FP16] + C7 --> THERMAL[ThermalState publish] + THERMAL --> C4[C4 Pose hybrid] +``` + +### Problem / Context + +Centralise GPU inference on Jetson: engine compilation, deserialise + warm-up, per-call inference, fallback chain (TRT → ONNX-RT+TRT-EP → PyTorch FP16), and `ThermalState` telemetry that drives D-CROSS-LATENCY-1. 
+ +### Scope + +**In scope**: engine cache lifecycle, deserialise + warm-up budget (AC-NEW-1), `ThermalState` publisher from `jetson-stats`, D-C10-3 takeoff content-hash gate (engine-side), D-C10-7 filename-schema enforcement, ONNX-RT fallback path. + +**Out of scope**: cache artifact build (E-C10), tile cache (E-C6), the per-frame consumers (their own epics). + +### Architecture notes + +- File: `components/09_c7_inference/description.md`. +- Python in-process abstraction over C++ TRT bindings; no separate process. +- Engines hardware-tied (SM 87 / JP 6.2 / TRT 10.3 / FP16) per D-C10-6. +- Helper `EngineFilenameSchema` is shared with E-C10. + +### Interface specification + +```python +class InferenceRuntime: + def load_engine(model_id: str) -> EngineHandle: ... + def infer(handle: EngineHandle, batch: Tensor) -> Tensor: ... + def thermal_state() -> ThermalState: ... + def warm_up(handle: EngineHandle) -> None: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant Caller as C2/C2.5/C3/C3.5 + participant C7 as InferenceRuntime + participant GPU as TRT/ONNX/PT + Caller->>C7: infer(handle, batch) + C7->>GPU: forward + GPU-->>C7: output + C7-->>Caller: tensor + C7-->>C4: ThermalState pub (≥1 Hz) +``` + +### Dependencies + +- E-BOOT, E-CC-CONF, E-CC-FDR-CLIENT. +- External: TensorRT 10.3, ONNX Runtime + TRT EP, PyTorch FP16, jetson-stats. + +### Acceptance criteria + +- AC-NEW-1 cold-start: every required engine deserialises + warms in ≤ 30 s p95 (C7-IT-01). +- AC-NEW-5: `ThermalState` updates ≥ 1 Hz; throttle-detection latency ≤ 1 s; C4 hybrid switch within 1 frame (C7-IT-02). +- D-C10-3: `EngineHashMismatchError` aborts F2 takeoff; no GPU memory allocated on mismatch (C7-IT-03). +- D-C10-7: filename-schema mismatch refused at parse time (C7-IT-04). +- ONNX-RT fallback path produces correct results when TRT engine missing (C7-IT-05). 
+ +### Non-functional requirements + +- Per-model p95 latencies (C7-PT-01): UltraVPR ≤ 60 ms, LightGlue ≤ 30 ms, AdHoP ≤ 90 ms, DISK ≤ 50 ms. +- GPU memory all engines resident ≤ 4 GB; system RAM ≤ 1.5 GB (C7-PT-02). + +### Risks & mitigations + +- **R04** (engine cache hardware-tied) — D-C10-7 + D-C10-3 enforced at C7's deserialise path. +- **R10** (Marginals under thermal throttle) — C7's `ThermalState` publish is the upstream input to the C4 hybrid. + +### Effort + +T-shirt L; 21–34 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | TRT engine load + warm-up + cache lifecycle | 5 | +| 2 | ONNX-RT + TRT-EP fallback path | 3 | +| 3 | PyTorch FP16 simple-baseline path | 3 | +| 4 | D-C10-3 content-hash gate + D-C10-7 filename schema enforcement | 3 | +| 5 | `ThermalState` publisher from `jetson-stats` | 3 | +| 6 | Component-internal tests C7-IT-01..05 + C7-PT-01..02 + C7-ST-01 | 5 | + +### Key constraints + +- RESTRICT-HW-1 (Jetson + 25 W TDP), AC-4.2 (8 GB system memory), AC-NEW-5 (thermal envelope). + +### Testing strategy + +- Per `components/09_c7_inference/tests.md`. + +--- + +## E-C6 — C6 Tile Cache + Spatial Index + +**Tracker**: pending | **Type**: component | **T-shirt**: M | **Story points**: 13–21 + +### System context + +```mermaid +flowchart LR + C11[C11 TileDownloader] --> C6 + C10[C10 build] --> C6 + C5[C5 mid-flight gen] --> C6 + C2[C2 VPR] --> C6 + C25[C2.5 rerank] --> C6 + C3[C3 matcher] --> C6 + C11U[C11 TileUploader] --> C6 +``` + +### Problem / Context + +Persistent imagery store byte-identical to `satellite-provider`'s on-disk layout, plus the FAISS HNSW spatial index for VPR. Sole writer: C11 TileDownloader (production) + C5/orthorectifier (mid-flight). Sole readers: C2/C2.5/C3 (per-frame) + C11 TileUploader (post-landing). 
+ +### Scope + +**In scope**: Postgres `tiles` schema, filesystem JPEG layout matching `satellite-provider`, FAISS HNSW build/load (the index FILE — population via C10), per-sector freshness gates at write-time, 10 GB cache budget enforcement with LRU eviction, content-SHA-256 invariant on insert, mid-flight tile insert with `quality_metadata`. + +**Out of scope**: tile fetch (E-C11), descriptor population (E-C10), inference (E-C7). + +### Architecture notes + +- File: `components/08_c6_tile_cache/description.md`, `data_model.md`. +- Schema in `data_model.md`; Postgres 16; SHA-256 sidecar via helper `Sha256Sidecar`. + +### Interface specification + +```python +class TileStore: + def insert(tile: TileRecord, jpeg: bytes) -> None: ... # raises FreshnessRejected, ContentHashMismatch + def get_tile_pixels(tile_id: TileId) -> bytes: ... + def query_spatial(bbox: Bbox, zoom: int) -> list[TileRecord]: ... + def mark_uploaded(tile_id: TileId) -> None: ... + def pending_uploads() -> list[TileRecord]: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant W as Writer (C11 / C5) + participant C6 as C6 + participant DB as Postgres + participant FS as Filesystem + W->>C6: insert(tile, jpeg) + C6->>C6: freshness gate + sha256 check + C6->>FS: write jpeg + sidecar + C6->>DB: insert row +``` + +### Dependencies + +- E-BOOT, E-CC-LOG, E-CC-CONF. +- External: PostgreSQL 16, FAISS. + +### Acceptance criteria + +- AC-8.1: filesystem layout byte-identical to `satellite-provider` for the same coordinate (C6-IT-01). +- AC-8.2 / AC-NEW-6: per-sector freshness gate rejects in active_conflict, downgrade-flags in stable_rear (C6-IT-02 / C6-IT-05). +- AC-8.4: every mid-flight tile carries `quality_metadata` (C6-IT-03). +- AC-NEW-3: peak F4 burst (5 Hz, 100 tiles) writes without dropping (C6-IT-04). +- RESTRICT-SAT-2: 10 GB cap enforced with LRU eviction, every eviction logged (C6-IT-06). +- Defensive: SHA-256 mismatch rejects insert (C6-ST-01). 
+ +### Non-functional requirements + +- Per-tile read p95 (warm mmap) ≤ 0.5 ms; cold ≤ 50 ms (C6-PT-01). + +### Risks & mitigations + +- **R08** (freshness drift in active_conflict) — write-side gate is the primary mitigation. + +### Effort + +T-shirt M; 13–21 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | Postgres `tiles` schema + migration | 3 | +| 2 | Filesystem JPEG store byte-identical to satellite-provider | 3 | +| 3 | FAISS HNSW load/save + mmap | 3 | +| 4 | Freshness gate + sector classification | 3 | +| 5 | 10 GB LRU eviction with logging | 3 | +| 6 | Component-internal tests C6-IT-01..06 + C6-PT-01 + C6-ST-01 | 5 | + +### Key constraints + +- AC-8.1, AC-8.2, AC-NEW-6, RESTRICT-SAT-2, RESTRICT-UAV-4. + +### Testing strategy + +Per `components/08_c6_tile_cache/tests.md`. + +--- + +## E-C11 — C11 Tile Manager (TileDownloader + TileUploader) + +**Tracker**: pending | **Type**: component | **T-shirt**: M | **Story points**: 13–21 + +### System context + +```mermaid +flowchart LR + SP[satellite-provider] -->|GET| DL[C11 TileDownloader] + DL --> C6[C6 cache] + C6 --> UP[C11 TileUploader] + UP -->|POST /ingest| SP + classDef airborne fill:#fee + classDef operator fill:#cef + class DL,UP operator +``` + +### Problem / Context + +Sole operator-side network I/O against `satellite-provider`, both directions. Strict ADR-004: never loaded into the airborne companion image. Bundled because download + upload share auth, HTTP client, deployment unit, and the airborne-exclusion property. + +### Scope + +**In scope**: `TileDownloader.fetch` (download → freshness gate → write to C6), `TileUploader.upload_pending` (read C6 pending → sign → POST → mark uploaded), per-flight ephemeral signing key, idempotent retry on partial-success batches, `flight_state == ON_GROUND` gate (defense-in-depth atop ADR-004). + +**Out of scope**: any airborne code; cache artifact build (E-C10); orchestration (E-C12). 
+ +### Architecture notes + +- File: `components/12_c11_tilemanager/description.md`. +- ADR-004 enforcement via E-BOOT's SBOM diff + runtime self-check. +- Test substitute: e2e-test `mock-suite-sat-service` fixture under `tests/fixtures/` (R01). + +### Interface specification + +```python +class TileDownloader: + def fetch(req: FetchRequest) -> DownloadBatchReport: ... + +class TileUploader: + def upload_pending(flight_state: FlightStateSignal) -> UploadBatchReport: ... + # raises UploadGateBlockedError if flight_state != ON_GROUND +``` + +### Data flow + +```mermaid +sequenceDiagram + participant Op as Operator + participant DL as TileDownloader + participant SP as satellite-provider + participant C6 as C6 + Op->>DL: fetch(area, sector_classification) + DL->>SP: GET tiles + SP-->>DL: tiles + metadata + DL->>C6: insert (after freshness gate) + DL-->>Op: DownloadBatchReport +``` + +### Dependencies + +- E-C6, E-CC-CONF, E-CC-LOG. +- External: real `satellite-provider` (download); D-PROJ-2 endpoint OR e2e-test fixture (upload). + +### Acceptance criteria + +- C11-IT-01: TileDownloader fetch + freshness gate + C6 write byte-identical layout. +- C11-IT-02: stale-rejection counts surface in `DownloadBatchReport`. +- C11-IT-03: TileUploader posts pending, signs payloads, marks uploaded on 202. +- C11-IT-04: `UploadGateBlockedError` when not ON_GROUND. +- C11-IT-05: idempotent retry — already-acked tiles not re-sent. +- C11-ST-01: airborne process cannot import `c11_tilemanager` (R02 enforcement). +- C11-ST-02: NFT-SEC-02 network-egress test passes. +- C11-ST-03: per-flight key zeroised after upload. + +### Non-functional requirements + +- Download throughput ≥ 50 MB/s on 1 Gbps link (C11-PT-01). +- Upload throughput ≥ 20 tiles/s with signing (C11-PT-02). + +### Risks & mitigations + +- **R01** (D-PROJ-2 not yet shipped) — TileUploader works against the e2e-test fixture; retire the fixture from production use once the real endpoint lands. 
+- **R02** (ADR-004 break) — three enforcement gates; C11 tests verify each. +- **R09** (key compromise) — per-flight ephemeral keys; voting layer for compromise detection. + +### Effort + +T-shirt M; 13–21 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | TileDownloader: GET + freshness gate + C6 write | 5 | +| 2 | TileUploader: read pending + sign + POST + mark uploaded | 5 | +| 3 | Idempotent retry on partial-success batch | 3 | +| 4 | `flight_state == ON_GROUND` gate (defense-in-depth) | 2 | +| 5 | Per-flight ephemeral signing key + zeroisation | 3 | +| 6 | Component-internal tests C11-IT-01..05 + C11-PT-01..02 + C11-ST-01..03 + C11-AT-01 | 5 | + +### Key constraints + +- ADR-004, RESTRICT-SAT-1 (no in-flight Service calls), AC-8.3, AC-8.4, AC-NEW-6. + +### Testing strategy + +Per `components/12_c11_tilemanager/tests.md`. + +--- + +## E-C10 — C10 Pre-flight Cache Provisioning + +**Tracker**: pending | **Type**: component | **T-shirt**: M | **Story points**: 13–21 + +### System context + +```mermaid +flowchart LR + C6[C6 already populated by C11] --> C10 + C10 --> ENGINES[TRT engines] + C10 --> DESCS[FAISS descriptors] + C10 --> MAN[signed Manifest] + ENGINES & DESCS & MAN --> AIRBORNE[airborne image at F2 takeoff] +``` + +### Problem / Context + +Build model-derived artifacts from an already-populated C6: TRT engines, VPR descriptors (calling C2's `embed_query` over the corpus), the signed Manifest with content-hashes. Idempotent re-run on unchanged C6. + +### Scope + +**In scope**: `CacheProvisioner.build_artifacts`, `ManifestVerifier.verify`, idempotence (D-C10-1), Manifest covers every shipped artifact, hardware-tied engine compile (D-C10-6), filename schema (D-C10-7), operator-key requirement. + +**Out of scope**: tile fetch (E-C11), tile cache writes (E-C6), engine deserialisation (E-C7). + +### Architecture notes + +- File: `components/11_c10_provisioning/description.md`. 
+- C10 narrowed in this Plan cycle: it does NOT talk to `satellite-provider`. Tiles must be present in C6 before C10 runs. + +### Interface specification + +```python +class CacheProvisioner: + def build_artifacts(corpus_root: Path, key_path: Path) -> BuildReport: ... + +class ManifestVerifier: + def verify(manifest: Path, public_key: PublicKey) -> ManifestVerdict: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant Op as Operator + participant C10 as CacheProvisioner + participant C2 as C2 embed_query + participant FS as Filesystem + Op->>C10: build_artifacts(corpus_root, key) + C10->>C2: embed every tile + C2-->>C10: descriptors + C10->>FS: write engines + faiss + manifest + C10->>FS: sign manifest with operator key + C10-->>Op: BuildReport +``` + +### Dependencies + +- E-C6, E-C7, E-CC-LOG. + +### Acceptance criteria + +- C10-IT-01: end-to-end build produces engines + descriptors + signed Manifest. +- C10-IT-02: ManifestVerifier rejects tampered or wrong-key Manifests. +- C10-IT-03: idempotent re-run — same hash, no recompile (D-C10-1). +- C10-IT-04: ManifestCoverageError on orphan files (no smuggled artifacts). +- C10-IT-05: Tier-2 build produces SM 87 / JP 6.2 / TRT 10.3 / FP16 engines (D-C10-6). +- C10-ST-01: build refuses dev-key signing in operator mode. + +### Non-functional requirements + +- Cold build wall-clock ≤ 12 min on developer laptop with NVIDIA GPU; warm idempotent re-run ≤ 1 min (C10-PT-01). + +### Risks & mitigations + +- **R04** (engine cache hardware-tied) — owner of the build side; deserialise side is C7. + +### Effort + +T-shirt M; 13–21 points. 
+ +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | TRT engine compile (per-model) | 5 | +| 2 | FAISS descriptor population via C2's embed path | 3 | +| 3 | Signed Manifest builder + content-hash table | 3 | +| 4 | ManifestVerifier with operator-key requirement | 3 | +| 5 | Idempotent re-run + ManifestCoverageError | 3 | +| 6 | Component-internal tests C10-IT-01..05 + C10-PT-01 + C10-ST-01 | 5 | + +### Key constraints + +- AC-8.3, AC-NEW-1, D-C10-1 / D-C10-3 / D-C10-6 / D-C10-7. + +### Testing strategy + +Per `components/11_c10_provisioning/tests.md`. + +--- + +## E-C12 — C12 Operator Pre-flight Tooling + +**Tracker**: pending | **Type**: component | **T-shirt**: M | **Story points**: 13–21 + +### System context + +```mermaid +flowchart LR + CLI[operator-tool CLI] + CLI --> C11D[C11 TileDownloader] + CLI --> C10[C10 CacheProvisioner] + CLI --> C11U[C11 TileUploader] + CLI --> RELOC[AC-3.4 re-loc workflow] + CLI --> FDR[FDR retrieval] +``` + +### Problem / Context + +Operator-facing CLI that sequences pre-flight (C11 download → C10 build) and post-landing (C11 upload), surfaces actionable failures, and handles the AC-3.4 re-localization workflow. Delivered as part of the operator-tooling tarball. + +### Scope + +**In scope**: CLI subcommands (`download`, `build-cache`, `upload-pending`, `reloc-confirm`), `CacheBuildReport` aggregation, post-landing `flight_state == ON_GROUND` confirmation from FDR, sector-classification UI hook, FDR retrieval helpers. + +**Out of scope**: actual download/upload (E-C11); engine compile (E-C10); FDR write side (E-C13). + +### Architecture notes + +- File: `components/13_c12_operator_tooling/description.md`. +- Strict process boundary: C12 is operator-side only, in the same image as C11, but never airborne. + +### Interface specification + +```python +class OperatorTool: + def build_cache(area: Area, sector_classification: SectorMap) -> CacheBuildReport: ... 
+ def trigger_post_landing_upload(fdr_root: Path) -> UploadBatchReport: ... + def confirm_relocation(candidate: ReLocCandidate) -> None: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant Op as Operator + participant C12 as OperatorTool + participant C11 as C11 + participant C10 as C10 + Op->>C12: build_cache(area) + C12->>C11: TileDownloader.fetch + C11-->>C12: DownloadBatchReport + C12->>C10: build_artifacts + C10-->>C12: BuildReport + C12-->>Op: CacheBuildReport +``` + +### Dependencies + +- E-C10, E-C11, E-CC-LOG. + +### Acceptance criteria + +- C12-IT-01: operator re-loc workflow returns SUT to `satellite_anchored` ≤ 30 s (AC-3.4). +- C12-IT-02: `build_cache` orchestrates C11 then C10; download failure aborts before C10. +- C12-IT-03: `trigger_post_landing_upload` requires ≥ 30 s confirmed ON_GROUND in FDR. +- C12-IT-04: actionable failure messages + non-zero exit on stale-tile rate > 30% or manifest signature failure. +- C12-ST-01: no CLI command path imports into airborne package boundary. + +### Non-functional requirements + +- End-to-end `build_cache` wall-clock ≤ 18 min on developer laptop with NVIDIA GPU (C12-PT-01). + +### Risks & mitigations + +- **R08** (freshness drift) — actionable failure surfacing in CacheBuildReport. + +### Effort + +T-shirt M; 13–21 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | CLI scaffolding + subcommand routing | 3 | +| 2 | `build_cache` orchestration (C11 then C10) | 3 | +| 3 | `trigger_post_landing_upload` with FDR-state confirmation | 3 | +| 4 | AC-3.4 re-localization workflow | 3 | +| 5 | Actionable failure surfacing in CacheBuildReport | 2 | +| 6 | Component-internal tests C12-IT-01..04 + C12-PT-01 + C12-ST-01 + C12-AT-01 | 5 | + +### Key constraints + +- ADR-004 (C12 lives operator-side); AC-3.4, AC-8.3, AC-8.4. + +### Testing strategy + +Per `components/13_c12_operator_tooling/tests.md`. 
+ +--- + +## E-C1 — C1 Visual / Visual-Inertial Odometry + +**Tracker**: pending | **Type**: component | **T-shirt**: XL | **Story points**: 34–55 + +### System context + +```mermaid +flowchart LR + NAVCAM[Nav camera 3 Hz] --> C1 + C8IMU[C8 ImuWindow 100-200 Hz] --> C1 + CAL[CameraCalibration] --> C1 + C1 --> C5[C5 StateEstimator] +``` + +### Problem / Context + +Per-frame relative pose SE(3) + 6×6 covariance + IMU bias estimate from nav-camera + FC IMU. Three pluggable strategies (Okvis2 production-default, VinsMono research-only, KltRansac mandatory simple-baseline) selected at startup, build-time gated, never hot-swappable. Largest single epic by complexity. + +### Scope + +**In scope**: `VioStrategy` interface + the three concrete strategies, `ImuPreintegrator` helper, warm-start path (AC-5.1), reboot recovery (AC-5.3), KltRansac as the simple-baseline AC-2.1a check, honest covariance under degradation. + +**Out of scope**: state fusion (E-C5), pose estimation (E-C4), satellite anchoring (E-C2/C3/C4 chain). + +### Architecture notes + +- File: `components/01_c1_vio/description.md`. +- Strategy + composition root + build-time exclusion (ADR-001 / ADR-002 / ADR-009). +- C++ strategies via pybind11; KltRansac thin Python wrapper around OpenCV. +- `ImuPreintegrator` shared with E-C5 (built once, used twice). + +### Interface specification + +```python +class VioStrategy(Protocol): + def process_frame(frame: NavCameraFrame, imu: ImuWindow, cal: CameraCalibration) -> VioOutput: ... + def reset_to_warm_start(pose: WarmStartPose) -> None: ... + def health_snapshot() -> VioHealth: ... +``` + +DTOs in `components/01_c1_vio/description.md` § 2. 
+ +### Data flow + +```mermaid +sequenceDiagram + participant CAM as Nav camera + participant C1 as VioStrategy + participant C5 as C5 + participant FDR as FDR + CAM->>C1: NavCameraFrame + C1->>C1: IMU preintegrate + feature tracking + C1->>C5: VioOutput (relative pose + 6x6 cov + bias) + C1->>FDR: VioHealth (ERROR + WARN; DEBUG to stdout) +``` + +### Dependencies + +- E-BOOT, E-CC-FDR-CLIENT, E-C7 (only for the simple-baseline KltRansac path; OKVIS2 / VinsMono are CPU-bound, not GPU). + +### Acceptance criteria + +- C1-IT-01: honest cov norm rises monotonically under feature-loss event (AC-1.3 / AC-1.4). +- C1-IT-02: `VioOutput` schema invariants — SPD covariance + matched frame_id (AC-1.4). +- C1-IT-03: KltRansac ≥ 95% tracked-frame ratio on Derkachi normal segment (AC-2.1a engine rule). +- C1-IT-04: MRE p95 < 1 px frame-to-frame for Okvis2 + KltRansac (AC-2.2). +- C1-IT-05: warm-start converges within 5 frames (AC-5.1). +- C1-IT-06: F8 reboot recovery from warm-start hint without fake confidence (AC-5.3). + +### Non-functional requirements + +- C1-PT-01: `process_frame` p95 ≤ 80 ms (Okvis2) at 3 Hz on Tier-2 with C2 backbone running concurrently; throughput ≥ 3 Hz sustained. +- CPU ≤ 30% one core; memory ≤ 1.5 GB resident. + +### Risks & mitigations + +- **R10** (latency under thermal throttle) — C1's budget partition is fixed; thermal-driven hybrid lives in C4. +- **R12** (single deployment camera) — KltRansac engine-rule path stays camera-agnostic; comparative IT-12 study uses static fixtures. + +### Effort + +T-shirt XL; 34–55 points. 
+ +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `VioStrategy` interface + composition wiring | 3 | +| 2 | OKVIS2 strategy (pybind11 binding + integration) | 5 | +| 3 | VinsMono strategy (research-only; behind BUILD_VINS_MONO) | 5 | +| 4 | KltRansac simple-baseline strategy | 5 | +| 5 | `ImuPreintegrator` helper (shared with C5) | 3 | +| 6 | Warm-start + F8 reboot recovery paths | 3 | +| 7 | Honest-covariance contract tests | 3 | +| 8 | Component-internal tests C1-IT-01..06 + C1-PT-01 | 5 | + +### Key constraints + +- AC-1.3, AC-1.4, AC-2.1a, AC-2.2, AC-4.1, AC-5.1, AC-5.3; RESTRICT-UAV-3 (sharp turns < 5% overlap). + +### Testing strategy + +Per `components/01_c1_vio/tests.md` + suite-level FT-P-02 / FT-P-04 / FT-P-05. + +--- + +## E-C2 — C2 Visual Place Recognition + +**Tracker**: pending | **Type**: component | **T-shirt**: L | **Story points**: 21–34 + +### System context + +```mermaid +flowchart LR + CAM[Nav camera] --> C2 + C7[C7 backbone] --> C2 + C6[C6 FAISS index] --> C2 + C2 --> C25[C2.5 Re-rank] +``` + +### Problem / Context + +Top-K=10 candidate retrieval from the pre-cached corpus by descriptor similarity. UltraVPR primary, MegaLoc secondary, NetVLAD mandatory simple-baseline. Boundary between cheap retrieval and expensive matching. + +### Scope + +**In scope**: `VprStrategy` + multiple backbones, FAISS HNSW lookup, descriptor pre-processing (resize/crop/normalise), L2 normalisation via `DescriptorNormaliser`, descriptor population entry-point used by C10. + +**Out of scope**: re-rank (E-C2.5), matching (E-C3), index build (E-C10). + +### Architecture notes + +- File: `components/02_c2_vpr/description.md`. +- Strategy + ADR-001/002/009. + +### Interface specification + +```python +class VprStrategy(Protocol): + def embed_query(frame: NavCameraFrame, cal: CameraCalibration) -> VprQuery: ... + def retrieve_topk(query: VprQuery, k: int) -> VprResult: ... + def descriptor_dim() -> int: ... 
+``` + +### Data flow + +```mermaid +sequenceDiagram + participant CAM as Nav camera + participant C2 as VprStrategy + participant C7 as C7 + participant C6 as FAISS + CAM->>C2: NavCameraFrame + C2->>C7: backbone forward + C7-->>C2: embedding + C2->>C6: HNSW search k=10 + C6-->>C2: candidates + C2-->>C25: VprResult +``` + +### Dependencies + +- E-C6, E-C7, E-CC-FDR-CLIENT. + +### Acceptance criteria + +- C2-IT-01: UltraVPR recall@10 ≥ 0.95; NetVLAD ≥ 0.85 on Derkachi (AC-2.1b + engine rule). +- C2-IT-02: `VprResult` invariants (length, sorted distances, label). +- C2-IT-03: poisoned-tile top-1 rate within AC-NEW-7 relaxed CI. +- C2-IT-04: scale-ratio ±20% recall@10 ≥ 0.85 (AC-8.6 scale half). +- C2-ST-01: index handle invalidation rejected with `IndexUnavailableError`. + +### Non-functional requirements + +- C2-PT-01: `embed_query` p95 ≤ 60 ms; `retrieve_topk` p95 ≤ 2 ms; combined ≤ 65 ms (AC-4.1 partition). +- GPU ≤ 600 MB resident; system mem ≤ 200 MB for index handle. + +### Risks & mitigations + +- **R06** (VPR top-1 false positive) — C2.5 + C3 + AC-NEW-7 downstream. + +### Effort + +T-shirt L; 21–34 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `VprStrategy` interface + composition | 3 | +| 2 | UltraVPR backbone (TRT) | 5 | +| 3 | MegaLoc, MixVPR, SelaVPR, EigenPlaces secondary backbones | 5 | +| 4 | NetVLAD mandatory simple-baseline | 3 | +| 5 | FAISS HNSW load + lookup wiring | 3 | +| 6 | `DescriptorNormaliser` helper (shared with C10) | 2 | +| 7 | Component-internal tests C2-IT-01..04 + C2-PT-01 + C2-ST-01 | 5 | + +### Key constraints + +- AC-2.1b, AC-2.2, AC-4.1, AC-8.6, AC-NEW-7. + +### Testing strategy + +Per `components/02_c2_vpr/tests.md`. 
+ +--- + +## E-C2.5 — C2.5 Inlier-based Re-rank + +**Tracker**: pending | **Type**: component | **T-shirt**: S | **Story points**: 5–8 + +### System context + +```mermaid +flowchart LR + C2[C2 K=10] --> C25 + C7[C7 LightGlueRuntime helper] --> C25 + C6[C6 tile pixels] --> C25 + C25 --> C3[C3 N=3] +``` + +### Problem / Context + +K=10 → N=3 by single-pair LightGlue inlier count. Boundary between cheap retrieval and expensive matching. Shares `LightGlueRuntime` helper with C3 (R14 — owned by helper, not by either component). + +### Scope + +**In scope**: `ReRankStrategy` + `InlierCountReRanker`, drop-and-continue on per-candidate failure. + +**Out of scope**: matching itself (E-C3); LightGlue runtime ownership (the helper is its own module). + +### Architecture notes + +- File: `components/03_c2_5_rerank/description.md`. +- Helper-ownership decision documented in R14 / risk_mitigations.md. + +### Interface specification + +```python +class ReRankStrategy(Protocol): + def rerank(frame: NavCameraFrame, vpr_result: VprResult, n: int) -> RerankResult: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant C2 as C2 + participant C25 as C2.5 + participant LG as LightGlueRuntime helper + participant C6 as C6 + C2->>C25: VprResult (k=10) + loop 10 candidates + C25->>C6: get_tile_pixels + C25->>LG: single-pair inlier count + LG-->>C25: inlier count + end + C25-->>C3: top-N=3 by inlier count +``` + +### Dependencies + +- E-C2, E-C7, E-C6, shared `LightGlueRuntime` helper (with C3). + +### Acceptance criteria + +- C2.5-IT-01: top-1 promotion rate ≥ 0.98 (rerank rarely overrides correct C2 top-1). +- C2.5-IT-02: drop-and-continue on per-candidate `RerankBackboneError`. +- C2.5-IT-03: shared `LightGlueRuntime` serial-access invariant (no deadlock; bit-identical to single-threaded). + +### Non-functional requirements + +- C2.5-PT-01: `rerank` p95 ≤ 80 ms for 10 single-pair LightGlue passes; a single engine instance is reused across calls.
+- GPU mem ≤ 300 MB shared LightGlue engine. + +### Risks & mitigations + +- **R14** (apparent C2.5↔C3 cycle) — resolved this iteration via helper ownership. + +### Effort + +T-shirt S; 5–8 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `InlierCountReRanker` + drop-and-continue | 3 | +| 2 | Shared `LightGlueRuntime` helper module | 3 | +| 3 | Component-internal tests C2.5-IT-01..03 + C2.5-PT-01 | 2 | + +### Key constraints + +- AC-2.1b, AC-4.1, AC-NEW-7. + +### Testing strategy + +Per `components/03_c2_5_rerank/tests.md`. + +--- + +## E-C3 — C3 Cross-Domain Matcher + +**Tracker**: pending | **Type**: component | **T-shirt**: L | **Story points**: 21–34 + +### System context + +```mermaid +flowchart LR + C25[C2.5 N=3] --> C3 + C7[C7] --> C3 + CAL[CameraCalibration] --> C3 + C6[C6 tiles] --> C3 + C3 --> C35[C3.5 AdHoP] +``` + +### Problem / Context + +2D-3D correspondences between nav-camera and the top-N=3 satellite tiles, with RANSAC inliers + reprojection residual. Dominant compute cost in F3. Backbone choice locked (DISK+LightGlue per D-C3-1 = (a)) pending IT-12 verdict. + +### Scope + +**In scope**: `CrossDomainMatcher` + DISK+LightGlue (primary) + ALIKED+LightGlue (secondary) + XFeat (alternate); RANSAC + reprojection residual via `RansacFilter` helper; `InsufficientInliersError` propagation. + +**Out of scope**: refinement (E-C3.5); pose estimation (E-C4); LightGlue runtime ownership (helper). + +### Architecture notes + +- File: `components/04_c3_matcher/description.md`. + +### Interface specification + +```python +class CrossDomainMatcher(Protocol): + def match(frame: NavCameraFrame, rerank: RerankResult, cal: CameraCalibration) -> MatchResult: ... + def health_snapshot() -> MatcherHealth: ... 
+``` + +### Data flow + +```mermaid +sequenceDiagram + participant C25 as C2.5 + participant C3 as C3 + participant C7 as C7 + C25->>C3: RerankResult (n=3) + loop 3 candidates + C3->>C7: backbone forward + C3->>C3: RANSAC + residual + end + C3-->>C35: MatchResult (best by inlier count) +``` + +### Dependencies + +- E-C2.5, E-C7, shared `LightGlueRuntime` helper, shared `RansacFilter` helper. + +### Acceptance criteria + +- C3-IT-01: best-candidate inlier count p5 ≥ 80 (AC-1.1 partition). +- C3-IT-02: deterministic `best_candidate_idx == argmax(inlier_count)` with deterministic tie-break. +- C3-IT-03: cross-domain MRE p95 < 2.5 px (AC-2.2). +- C3-IT-04: tilt ±20° + 350 m outliers — inlier count p10 ≥ 40 (AC-3.1). +- C3-IT-05: `InsufficientInliersError` propagation when all N=3 fail. + +### Non-functional requirements + +- C3-PT-01: `match` p95 ≤ 180 ms; per-candidate ≤ 60 ms; throughput ≥ 3 Hz; GPU mem ≤ 800 MB combined. + +### Risks & mitigations + +- **R06** (false positive) — RANSAC + residual + downstream AC-NEW-7. +- **R10** (Marginals under throttle) — D-CROSS-LATENCY-1 hybrid touches C4 not C3 (C3's budget is fixed). + +### Effort + +T-shirt L; 21–34 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `CrossDomainMatcher` interface + composition | 3 | +| 2 | DISK+LightGlue primary | 5 | +| 3 | ALIKED+LightGlue secondary | 3 | +| 4 | XFeat alternate (lightweight) | 3 | +| 5 | `RansacFilter` helper (shared C3/C3.5/C4) | 3 | +| 6 | Component-internal tests C3-IT-01..05 + C3-PT-01 | 5 | + +### Key constraints + +- AC-1.1, AC-2.2, AC-3.1, AC-4.1. + +### Testing strategy + +Per `components/04_c3_matcher/tests.md`. 
+ +--- + +## E-C3.5 — C3.5 AdHoP-Conditional Refinement + +**Tracker**: pending | **Type**: component | **T-shirt**: M | **Story points**: 8–13 + +### System context + +```mermaid +flowchart LR + C3[C3 MatchResult] --> C35 + C7[C7 AdHoP backbone] --> C35 + C35 --> C4[C4 Pose] +``` + +### Problem / Context + +Conditional perspective preconditioning when residual exceeds threshold; passthrough otherwise. Preserves AC-4.1 budget on the steady-state path while keeping refinement for hard frames. + +### Scope + +**In scope**: `ConditionalRefiner` + `AdHoPRefiner` + `PassthroughRefiner` (both linked); residual threshold configuration; passthrough fall-through on `RefinerBackboneError`. + +**Out of scope**: matcher (E-C3); pose (E-C4). + +### Architecture notes + +- File: `components/05_c3_5_adhop/description.md`. +- Both implementations linked into the deployment binary; runtime gate is a config knob. + +### Interface specification + +```python +class ConditionalRefiner(Protocol): + def refine_if_needed(frame: NavCameraFrame, mr: MatchResult, threshold: float) -> MatchResult: ... + def was_invoked() -> bool: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant C3 as C3 + participant C35 as C3.5 + participant C7 as C7 + C3->>C35: MatchResult (residual=R) + alt R > threshold + C35->>C7: AdHoP backbone forward + C7-->>C35: refined correspondences + C35-->>C4: enriched MatchResult + else + C35-->>C4: passthrough MatchResult + end +``` + +### Dependencies + +- E-C3, E-C7. + +### Acceptance criteria + +- C3.5-IT-01: residual is reduced in ≥ 90% of invocations (AC-2.2 hard-frame portion). +- C3.5-IT-02: passthrough fall-through on `RefinerBackboneError` with bit-identical correspondences. +- C3.5-IT-03: invocation rate < 0.30 on Derkachi normal segment. + +### Non-functional requirements + +- C3.5-PT-01: invoked p95 ≤ 90 ms; passthrough p95 ≤ 0.5 ms; aggregated added latency ≤ 25 ms.
+ +### Risks & mitigations + +- **R10** (latency under throttle) — threshold tunable via operator-tooling pre-flight. + +### Effort + +T-shirt M; 8–13 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `AdHoPRefiner` (TRT engine + perspective preconditioning) | 5 | +| 2 | `PassthroughRefiner` no-op | 1 | +| 3 | Conditional gate + passthrough fall-through | 2 | +| 4 | Component-internal tests C3.5-IT-01..03 + C3.5-PT-01 | 3 | + +### Key constraints + +- AC-2.2, AC-4.1. + +### Testing strategy + +Per `components/05_c3_5_adhop/tests.md`. + +--- + +## E-C4 — C4 Pose Estimator + +**Tracker**: pending | **Type**: component | **T-shirt**: M | **Story points**: 13–21 + +### System context + +```mermaid +flowchart LR + C35[C3.5] --> C4 + CAL[CameraCalibration] --> C4 + C7[C7 ThermalState] --> C4 + C5GRAPH[C5 iSAM2 graph] --> C4 + C4 --> C5[C5 add_pose_anchor] +``` + +### Problem / Context + +Convert `MatchResult` into `PoseEstimate` (WGS84 + 6×6 covariance + provenance label). OpenCV `solvePnPRansac` + GTSAM `Marginals` for native 6×6; D-CROSS-LATENCY-1 hybrid degrades to Jacobian under thermal throttle. + +### Scope + +**In scope**: `OpenCVGtsamPoseEstimator`, GTSAM `Marginals` integration with C5's iSAM2 graph, Jacobian fallback, per-frame thermal-state-driven mode switch, `WgsConverter` helper usage. + +**Out of scope**: state fusion (E-C5); thermal telemetry source (E-C7). + +### Architecture notes + +- File: `components/06_c4_pose/description.md`. +- ADR-003 shared substrate: C4 adds factors to C5's graph; co-developed. +- ADR-006 (Jacobian fallback ~5–10% accuracy loss accepted under throttle). + +### Interface specification + +```python +class PoseEstimator(Protocol): + def estimate(mr: MatchResult, cal: CameraCalibration, thermal: ThermalState) -> PoseEstimate: ... + def current_covariance_mode() -> CovarianceMode: ... 
+``` + +### Data flow + +```mermaid +sequenceDiagram + participant C35 as C3.5 + participant C4 as C4 + participant C5 as C5 graph + participant C7 as C7 thermal + C35->>C4: MatchResult + C7-->>C4: ThermalState + alt thermal.throttle + C4->>C4: Jacobian covariance + else + C4->>C5: add factor + C5->>C5: Marginals.marginalCovariance + C5-->>C4: Sigma + end + C4-->>C5: PoseEstimate (add_pose_anchor) +``` + +### Dependencies + +- E-C3.5, E-C5 (co-developed shared substrate), shared `RansacFilter`, shared `WgsConverter`, shared `SE3Utils`. + +### Acceptance criteria + +- C4-IT-01: WGS84 accuracy p80 ≤ 50 m, p50 ≤ 20 m on Derkachi (AC-1.1 / AC-1.2). +- C4-IT-02: 6×6 SPD covariance + honest under inlier degradation (AC-1.4). +- C4-IT-03: D-CROSS-LATENCY-1 mode switch within 1 frame (AC-NEW-5 workstation portion). +- C4-IT-04: shared-graph integration with C5 — prior keyframe perturbations within tolerance. + +### Non-functional requirements + +- C4-PT-01: `estimate` p95 MARGINALS ≤ 90 ms; JACOBIAN ≤ 15 ms; switch ≤ 1 frame. + +### Risks & mitigations + +- **R10** (Marginals throttle) — primary owner of the hybrid switch. + +### Effort + +T-shirt M; 13–21 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `solvePnPRansac` + IPPE wiring | 3 | +| 2 | GTSAM `Marginals` factor add to C5 graph | 5 | +| 3 | Jacobian-degraded fallback | 3 | +| 4 | Per-frame thermal-state-driven switch | 2 | +| 5 | `WgsConverter` helper (shared with C8) | 3 | +| 6 | Component-internal tests C4-IT-01..04 + C4-PT-01 | 3 | + +### Key constraints + +- AC-1.1, AC-1.2, AC-1.4, AC-4.1, AC-NEW-5. + +### Testing strategy + +Per `components/06_c4_pose/tests.md`. 
+ +--- + +## E-C5 — C5 State Estimator + +**Tracker**: pending | **Type**: component | **T-shirt**: XL | **Story points**: 34–55 + +### System context + +```mermaid +flowchart LR + C1[C1 VioOutput] --> C5 + C4[C4 PoseEstimate] --> C5 + C8I[C8 IMU/attitude/gps_health] --> C5 + C5 --> C8O[C8 outbound 5 Hz] + C5 --> ORTHO[Orthorectifier → C6 mid-flight tile] + C5 --> FDR[FDR smoothed history] +``` + +### Problem / Context + +Own GTSAM iSAM2 + IncrementalFixedLagSmoother (K=10–20). Fuse VIO + Pose + FC IMU into the posterior state; emit smoothed current frame to C8 + smoothed past keyframes to FDR (AC-4.5 revised, NOT FC retroactive). Spoof-promotion gate (AC-NEW-2 / AC-NEW-8). Largest epic alongside C1. + +### Scope + +**In scope**: `StateEstimator` + `GtsamIsam2StateEstimator` (production-default) + `EskfStateEstimator` (mandatory simple-baseline); spoof-promotion gate; source-label state machine; smoothed history → FDR; AC-5.2 fallback path. + +**Out of scope**: VIO (E-C1); pose (E-C4); FC adapter (E-C8). Note: the orthorectifier is not out of scope — it could be split into its own epic, but per the spec it is kept inside C5 as an internal subcomponent. + +### Architecture notes + +- File: `components/07_c5_state/description.md`. +- ADR-003 (shared GTSAM substrate with C4); co-developed. +- ADR-008 + spoof gate logic. + +### Interface specification + +```python +class StateEstimator(Protocol): + def add_vio(o: VioOutput) -> None: ... + def add_pose_anchor(p: PoseEstimate) -> None: ... + def add_fc_imu(w: ImuWindow) -> None: ... + def current_estimate() -> EstimatorOutput: ... + def smoothed_history(n: int) -> list[EstimatorOutput]: ... + def health_snapshot() -> EstimatorHealth: ...
+``` + +### Data flow + +```mermaid +sequenceDiagram + participant C1 as C1 + participant C4 as C4 + participant C8I as C8 inbound + participant C5 as C5 iSAM2 + participant C8O as C8 outbound + participant FDR as FDR + C1->>C5: add_vio + C4->>C5: add_pose_anchor (factor add) + C8I->>C5: add_fc_imu + C5->>C5: iSAM2 update + Marginals + C5->>C8O: current_estimate (5 Hz) + C5->>FDR: smoothed_history (per AC-4.5) +``` + +### Dependencies + +- E-C1, E-C4, E-CC-FDR-CLIENT, E-C8 inbound side, shared `ImuPreintegrator`, `SE3Utils`, `WgsConverter`. + +### Acceptance criteria + +- C5-IT-01: `last_satellite_anchor_age_ms` reset/monotonic-rise (AC-1.3 binning). +- C5-IT-02: smoothed-current honest covariance (AC-1.4). +- C5-IT-03: VIO-only fallback under matcher failure (AC-3.5). +- C5-IT-04: smoothed past-keyframes → FDR but NOT to FC stream (AC-4.5 revised). +- C5-IT-05: 3 s no-estimate triggers AC-5.2 fallback. +- C5-IT-06: spoof-promotion gate ≥ 10 s + visual consistency (AC-NEW-2). +- C5-IT-07: visual blackout + spoof escalation (AC-NEW-8). +- C5-ST-01: spoof-rejection logging cannot be silenced. + +### Non-functional requirements + +- C5-PT-01: `add_pose_anchor` + `current_estimate` p95 ≤ 60 ms; memory ≤ 100 MB resident. + +### Risks & mitigations + +- **R05** (iSAM2 silent factor-add failure) — every factor add logs its success/failure outcome. +- **R07** (spoof premature promotion) — primary owner of the gate. + +### Effort + +T-shirt XL; 34–55 points.
+ +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `StateEstimator` interface + composition | 3 | +| 2 | iSAM2 + IncrementalFixedLagSmoother K=10-20 wiring | 5 | +| 3 | `BetweenFactorPose3` (VIO) + `GenericProjectionFactorCal3DS2` (pose) | 5 | +| 4 | `Marginals.marginalCovariance` integration | 3 | +| 5 | Source-label state machine + spoof-promotion gate | 5 | +| 6 | `EskfStateEstimator` mandatory simple-baseline | 5 | +| 7 | Smoothed-history → FDR path (NOT to FC) | 3 | +| 8 | AC-5.2 fallback path | 3 | +| 9 | Orthorectifier → C6 mid-flight tile gen sub-path | 3 | +| 10 | Component-internal tests C5-IT-01..07 + C5-PT-01 + C5-ST-01 | 5 | + +### Key constraints + +- AC-1.3, AC-1.4, AC-3.5, AC-4.5 (revised), AC-5.2, AC-NEW-2, AC-NEW-8. + +### Testing strategy + +Per `components/07_c5_state/tests.md`. + +--- + +## E-C8 — C8 FC + GCS Adapter + +**Tracker**: pending | **Type**: component | **T-shirt**: L | **Story points**: 21–34 + +### System context + +```mermaid +flowchart LR + FCIN[FC inbound MAVLink/MSP2] --> C8I[C8 inbound] + C8I --> C5[C5] + C8I --> C1[C1] + C5 --> C8O[C8 outbound] + C8O -->|GPS_INPUT / MSP2_SENSOR_GPS| FCOUT[FC] + C8O -->|telemetry 1-2 Hz| GCS[QGroundControl] +``` + +### Problem / Context + +Per-FC inbound + outbound. Inbound: subscribe to FC IMU/attitude/GPS-health/MAV_STATE; publish `ImuWindow`/`AttitudeWindow`/`GpsHealth`/`FlightStateSignal`. Outbound: encode `EstimatorOutput` for AP (`GPS_INPUT`) and iNav (`MSP2_SENSOR_GPS`) at 5 Hz with honest 6×6 → 2×2 covariance projection. Owns MAVLink 2.0 signing on AP wired channel (D-C8-9 = (d), R03 risk) + per-flight key rotation. Also feeds GCS at 1–2 Hz. + +### Scope + +**In scope**: `FcAdapter` + `PymavlinkArdupilotAdapter` + `Msp2InavAdapter`; `GcsAdapter` + `QgcTelemetryAdapter`; signing handshake + per-flight ephemeral key + zeroisation; D-C8-2 source-set switch (gated by IT-3); honest covariance projection. 
+ +**Out of scope**: state estimation (E-C5); GCS workflow logic (operator side, E-C12). + +### Architecture notes + +- File: `components/10_c8_fc_adapter/description.md`. +- Both AP + iNav adapters typically linked into the deployment binary (per ADR-002 — config picks one at runtime). +- ADR-008 source-set switch gated by IT-3. + +### Interface specification + +```python +class FcAdapter(Protocol): + def open(port: PortConfig, signing_key: bytes | None) -> None: ... + def subscribe_telemetry(cb: Callable[[FcTelemetryFrame], None]) -> Subscription: ... + def emit_external_position(o: EstimatorOutput) -> None: ... + def emit_status_text(msg: str, severity: Severity) -> None: ... + def request_source_set_switch() -> None: ... # AP only + def current_flight_state() -> FlightStateSignal: ... +``` + +### Data flow + +```mermaid +sequenceDiagram + participant FC as FC + participant C8I as C8 inbound + participant C5 as C5 + participant C8O as C8 outbound + FC->>C8I: IMU + attitude + gps_health + C8I->>C5: ImuWindow / AttitudeWindow / GpsHealth + C5->>C8O: EstimatorOutput + C8O->>FC: GPS_INPUT / MSP2_SENSOR_GPS @ 5 Hz +``` + +### Dependencies + +- E-C5, E-CC-CONF, E-CC-LOG. +- External: pymavlink, MSP2 client, ArduPilot SITL, QGroundControl SITL. + +### Acceptance criteria + +- C8-IT-01: 6×6 → 2×2 honest covariance projection within 1% norm. +- C8-IT-02: 5 Hz emission jitter ≤ ±5%. +- C8-IT-03: warm-start GPS from FC EKF ≤ 1 s after C8 ready (AC-5.1). +- C8-IT-04: GCS stream 1–2 Hz (AC-6.1). +- C8-IT-05: GCS commands accepted (AC-6.2). +- C8-IT-06: WGS84 round-trip ≤ 1 cm position residual (AC-6.3). +- C8-IT-07: source-set switch completes ≤ 3 s after gate-clear (AC-NEW-2). +- C8-IT-08: iNav adapter never attempts signing; the AP adapter always does (RESTRICT-COMM-2). +- C8-ST-01: MAVLink 2.0 signing handshake passes IT-3 SITL gate (R03). +- C8-ST-02: per-flight key never persists across flights.
+ +### Non-functional requirements + +- C8-PT-01: `emit_external_position` p95 ≤ 5 ms; inbound IMU callback p95 ≤ 1 ms. + +### Risks & mitigations + +- **R03** (signing handshake no precedent) — gated by IT-3; D-C8-2-FALLBACK options recorded. +- **R09** (key compromise) — per-flight ephemeral keys + zeroisation. + +### Effort + +T-shirt L; 21–34 points. + +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | `FcAdapter` interface + composition | 3 | +| 2 | `PymavlinkArdupilotAdapter` outbound `GPS_INPUT` | 5 | +| 3 | `Msp2InavAdapter` outbound `MSP2_SENSOR_GPS` | 3 | +| 4 | Inbound IMU/attitude/gps_health/MAV_STATE subscription | 3 | +| 5 | Honest 6×6 → 2×2 covariance projection | 3 | +| 6 | MAVLink 2.0 per-flight signing handshake (AP) | 5 | +| 7 | Source-set switch (AP D-C8-2 gated by IT-3) | 3 | +| 8 | `GcsAdapter` + downsampled telemetry | 3 | +| 9 | Component-internal tests C8-IT-01..08 + C8-PT-01 + C8-ST-01..02 | 5 | + +### Key constraints + +- AC-4.3, AC-4.4, AC-5.1, AC-5.2, AC-6.1, AC-6.2, AC-6.3, AC-NEW-2; RESTRICT-FC-1 / FC-2 / FC-3, RESTRICT-COMM-1 / COMM-2. + +### Testing strategy + +Per `components/10_c8_fc_adapter/tests.md`. + +--- + +## E-BBT — Blackbox Tests (FT/NFT scenarios) + +**Tracker**: pending | **Type**: tests | **T-shirt**: M | **Story points**: 13–21 + +### System context + +```mermaid +flowchart LR + TESTROOT[tests/ runner] --> FTP[FT-P functional positive] + TESTROOT --> FTN[FT-N functional negative] + TESTROOT --> NFTPERF[NFT-PERF Tier-2] + TESTROOT --> NFTLIM[NFT-LIM resource] + TESTROOT --> NFTSEC[NFT-SEC security] + TESTROOT --> NFTRES[NFT-RES resilience] + TESTROOT --> IT[IT integration] +``` + +### Problem / Context + +Per-component epics ship their own component-internal unit/contract tests; this epic parents the **suite-level** scenarios already specified in `_docs/02_document/tests/*.md`. They exercise end-to-end ACs and restrictions and bind multiple components together. 
+ +### Scope + +**In scope**: implementing the FT-P, FT-N, NFT-PERF, NFT-LIM, NFT-SEC, NFT-RES, IT scenario IDs cited in `traceability-matrix.md`. Test data setup, fixtures (Derkachi flight + AerialVL S03 + e2e-test mock-suite-sat-service), Tier-2 runner orchestration. + +**Out of scope**: per-component unit/contract tests (live in each component epic). + +### Architecture notes + +- Files: `_docs/02_document/tests/blackbox-tests.md`, `performance-tests.md`, `security-tests.md`, `resource-limit-tests.md`, `resilience-tests.md`, `environment.md`, `test-data.md`, `traceability-matrix.md`. +- Tier-1 vs Tier-2 split per ADR-005. + +### Interface specification + +Tests are pytest scenarios; no runtime interface beyond the test runner CLI. + +### Data flow + +```mermaid +sequenceDiagram + participant CI as CI runner + participant FX as Fixtures + participant SUT as System under test (compose / Tier-2 binary) + CI->>FX: stage Derkachi corpus + SITL containers + CI->>SUT: bring up + CI->>SUT: drive scenario inputs + SUT-->>CI: emitted MAVLink + FDR records + CI->>CI: assert per scenario pass criteria +``` + +### Dependencies + +- All component epics (each must ship ready-to-test). + +### Acceptance criteria + +- Every scenario ID cited in `traceability-matrix.md` exists, runs, and passes on its target tier. +- Coverage ≥ 75% gate held (currently 92.4% inclusive / 89.8% strict — confirmed pre-Step-6). +- PARTIAL / NOT COVERED rows have linked leftover entries explaining the deferral. + +### Non-functional requirements + +- Tier-1 full suite wall-clock ≤ 30 min on a developer laptop. +- Tier-2 NFT suite wall-clock ≤ 90 min on the bench Jetson. + +### Risks & mitigations + +- **R11** (statistical headroom) — NFT-RES-03 / NFT-SEC-01 use Monte-Carlo-with-CI per the AC-text relaxation. + +### Effort + +T-shirt M; 13–21 points (test implementation; scenario specs already exist). 
+ +### Child issues + +| # | Title | Pts | +|---|-------|-----| +| 1 | Test environment scaffolding (`tests/conftest.py`, fixtures dir, Postgres + SITL bring-up) | 3 | +| 2 | FT-P-* implementation (positive functional scenarios) | 5 | +| 3 | FT-N-* implementation (negative functional scenarios) | 3 | +| 4 | NFT-PERF / NFT-LIM Tier-2 runner integration | 5 | +| 5 | NFT-SEC implementation (incl. NFT-SEC-02 network egress + NFT-SEC-01 cache poisoning) | 5 | +| 6 | NFT-RES resilience scenarios | 3 | +| 7 | IT-3 ArduPilot SITL signing handshake (R03 gate) | 5 | +| 8 | IT-12 comparative-study runner | 3 | + +### Key constraints + +- AC-NEW-3, AC-NEW-5, AC-NEW-7, RESTRICT-HW-1. + +### Testing strategy + +This epic IS the testing strategy for system-level scenarios. Per-component testing belongs to component epics. + +--- + +## Lessons applied (Step 6 step-0 retrospective) + +`_docs/LESSONS.md` does not yet exist (this is the project's first cycle), so no prior estimation/architecture/dependencies lessons were folded into the sizing above. When this cycle ends, the Final step's quality checklist should propose a lessons file capturing: + +- C2.5 ↔ C3 helper-ownership (R14) — generalisable lesson: when two siblings share a runtime, place ownership in a shared helper from day one rather than discovering the cycle in a 4a evaluator pass. +- ADR-007 reversal (mock-as-fixture) — generalisable lesson: a test fixture is not a component; promoting one inflates architectural surface and risks contract drift. +- D-PROJ-2 / D-PROJ-3 carryforwards — generalisable lesson: cross-suite design dependencies belong in `_process_leftovers/` from the moment they are recognised, with full payload so a later cycle can replay them. + +These three are candidates for the next cycle's `LESSONS.md`. 
+ diff --git a/_docs/02_document/glossary.md b/_docs/02_document/glossary.md index 5390d52..dd577bb 100644 --- a/_docs/02_document/glossary.md +++ b/_docs/02_document/glossary.md @@ -38,7 +38,7 @@ Terms are alphabetical. Each entry: one-line definition + parenthetical source. **FDR / Flight Data Recorder** — Per-flight onboard NVM record (≤64 GB) of estimates, IMU traces, MAVLink stream, mid-flight tiles, system health, failed-tile thumbnails. Excludes raw nav/AI-camera frames. (source: AC-NEW-3) -**Flight state** — Boolean signal `IN_AIR | ON_GROUND` derived from FC `MAV_STATE` (MAVLink HEARTBEAT). Safety-critical: gates the post-landing upload path; `IN_AIR` forbids any outbound write to `satellite-provider`. Enforced primarily by process-level isolation — the upload daemon is not loaded in the airborne companion image. (source: user directive 2026-05-09) +**Flight state** — Boolean signal `IN_AIR | ON_GROUND` derived from FC `MAV_STATE` (MAVLink HEARTBEAT). Safety-critical: gates the post-landing upload path; `IN_AIR` forbids any outbound write to `satellite-provider`. Enforced primarily by process-level isolation — the Tile Manager (C11), which carries both the `TileDownloader` and the `TileUploader`, is not loaded in the airborne companion image. (source: user directive 2026-05-09) **GCS / Ground Control Station** — QGroundControl. Mission Planner is out of scope. (source: `restrictions.md`) @@ -60,9 +60,9 @@ Terms are alphabetical. Each entry: one-line definition + parenthetical source. **Nav camera / Navigation camera** — The fixed-downward (no gimbal) camera on the UAV; pinned model is `adti20`. Distinct from the operator-controlled AI camera. (source: `restrictions.md` §Cameras) -**Operator** — Pre-flight and post-flight human role: classifies the operational area (active-conflict vs stable rear), downloads tiles via `satellite-provider`, stages cache + calibration onto the companion before takeoff, and after landing triggers the post-landing upload tool. 
(source: `problem.md`, AC-3.4 / AC-6.2, user confirmation 2026-05-09) +**Operator** — Pre-flight and post-flight human role: classifies the operational area (active-conflict vs stable rear), drives the **Tile Manager** to download tiles from `satellite-provider`, stages calibration onto the companion before takeoff, and after landing triggers the **Tile Manager** upload run. (source: `problem.md`, AC-3.4 / AC-6.2, user confirmation 2026-05-09) -**Post-landing upload tool** — Operator-side process that runs only when `flight state == ON_GROUND`; pushes locally-saved mid-flight tiles to `satellite-provider`'s ingest endpoint. Implemented as a separate process / image so the upload code path is never loaded in the airborne companion. (source: user directive 2026-05-09) +**Tile Manager** — Operator-side component (C11) that owns both directions of network I/O against `satellite-provider`: pre-flight download (F1) into the local C6 store via the `TileDownloader` interface, and post-landing upload (F10) from C6 to the parent-suite ingest endpoint via the `TileUploader` interface (gated on `flight state == ON_GROUND`). Implemented as a separate binary / image so neither network path is loaded in the airborne companion (ADR-004 process-level isolation). Replaces the earlier "post-landing upload tool" naming after Plan-cycle scope expansion 2026-05-09. (source: user directive 2026-05-09) **`satellite-provider`** — First-class architecture boundary: the suite's existing .NET 8 REST microservice at `/Users/obezdienie001/dev/azaion/suite/satellite-provider/`. Runs in Docker (`:5100`, OpenAPI at `/swagger`); downloads Google Maps tiles; stores them in PostgreSQL + filesystem (`./tiles/{zoomLevel}/{x}/{y}.jpg`). Read-only from the onboard runtime; receives post-landing tile uploads via a yet-to-be-designed ingest endpoint (parent-suite work, D-PROJ-2). Synonym in older docs: "Suite Sat Service" / "Azaion Suite Satellite Service". 
(source: parent-suite `satellite-provider/README.md`, user confirmation 2026-05-09) diff --git a/_docs/02_document/risk_mitigations.md b/_docs/02_document/risk_mitigations.md new file mode 100644 index 0000000..a7f2fb3 --- /dev/null +++ b/_docs/02_document/risk_mitigations.md @@ -0,0 +1,141 @@ +# Risk Assessment — gps-denied-onboard Plan cycle — Iteration 1 + +**Date**: 2026-05-09 +**Scope**: technical / schedule / external risks identified during the 4a evaluator pass over architecture, system flows, data model, deployment, and the 14-component decomposition. +**Iteration policy**: if the user requests another round, this file becomes `risk_mitigations_02.md` and so on. + +## Risk Scoring Matrix + +| | Low Impact | Medium Impact | High Impact | +|--|------------|---------------|-------------| +| **High Probability** | Medium | High | Critical | +| **Medium Probability** | Low | Medium | High | +| **Low Probability** | Low | Low | Medium | + +## Acceptance Criteria by Risk Level + +| Level | Action Required | +|-------|----------------| +| Low | Accepted, monitored quarterly | +| Medium | Mitigation plan required before implementation | +| High | Mitigation + contingency plan required, reviewed weekly | +| Critical | Must be resolved before proceeding to next planning step | + +## Risk Register + +| ID | Risk | Category | Probability | Impact | Score | Mitigation | Owner | Status | +|----|------|----------|-------------|--------|-------|------------|-------|--------| +| R01 | D-PROJ-2 ingest endpoint not yet implemented service-side; F10 post-landing upload cannot reach a real `satellite-provider` until parent-suite work lands | External | High | Medium | High | C11 `TileUploader` keeps batches in pending-upload journal; e2e `mock-suite-sat-service` fixture exercises the contract in tests; leftover file tracks the cross-workspace dep; onboard release does not block on D-PROJ-2 | Parent suite | Mitigated | +| R02 | ADR-004 process-isolation invariant broken by 
accidental CMake link of C11 into the airborne image | Technical | Low | High | Medium | CI SBOM diff explicitly fails if any `c11_tilemanager/` symbol appears in the airborne `production-binary` artifact (see ADR-002 + CI security gate); runtime check in `runtime_root.py` panics if `c11_tilemanager` module imports non-test paths | Onboard team | Mitigated | +| R03 | MAVLink 2.0 per-flight signing key handshake on AP wired channel (D-C8-9 = (d)) has no production-deployed precedent | Technical | Medium | High | High | IT-3 ArduPilot SITL validation gate; D-C8-2-FALLBACK options recorded ((a) operator-manual RC aux + relaxed AC-NEW-2; (b) STATUSTEXT instead of automated switch; (c) escalate to ArduPilot dev community) per ADR-008 | Onboard team | Open (gated by IT-3) | +| R04 | TensorRT engine cache hardware-tied to SM 87; deserialising on a different SM corrupts inference silently | Technical | Medium | High | High | D-C10-7 self-describing filename `__sm_jp_trt_.engine` + D-C10-3 SHA-256 content-hash gate at F2 takeoff refuses deserialise on tuple mismatch; CI Tier-2 runs on the pinned JetPack 6.2 / TRT 10.3 / SM 87 image | Onboard team | Mitigated | +| R05 | iSAM2 numerical instability silently swallows factor-add failures, producing wrong covariance | Technical | Medium | Medium | Medium | C5 logs every `add_*` call's success/failure; `EstimatorHealth.cov_norm_growing_for_s` monotonicity check feeds the AC-NEW-8 spoof gate; `EstimatorFatalError` triggers AC-5.2 (3 s no estimate → FC IMU-only) | Onboard team | Mitigated | +| R06 | VPR top-1 false positive: visually-similar but geographically-wrong tile ranks above the true match | Technical | Medium | Medium | Medium | C2.5 inlier-count rerank narrows K=10 → N=3; C3 RANSAC + reprojection residual filter; C3.5 conditional AdHoP refinement on hard frames; D-CROSS-LATENCY-1 hybrid keeps the budget under thermal throttle. 
Cross-flight cache-poisoning safety budget (AC-NEW-7) gates downstream | Onboard team | Mitigated | +| R07 | FC GPS source promoted back from spoofed → trusted prematurely (AC-NEW-2 / AC-NEW-8 violation) | Technical | Low | High | Medium | C5 `SourceLabelStateMachine`: never re-promote until ≥10 s of `gps_health == STABLE_NON_SPOOFED` AND next satellite-anchored frame agrees with FC GPS within configurable tolerance; every reject logged to FDR + GCS STATUSTEXT (ADR-008) | Onboard team | Mitigated | +| R08 | Tile freshness drift inside `active_conflict` sectors between download and flight time (AC-NEW-6 violation in stale theatre) | External | Medium | Medium | Medium | C11 `TileDownloader` applies sector-classified freshness gate at fetch (reject in `active_conflict`, downgrade-no-`satellite_anchored`-label in `stable_rear`); `DownloadBatchReport` surfaces stale-rejection counts so the operator can re-pull. Manifest-hash idempotence (D-C10-1) makes re-runs cheap | Operator + onboard team | Mitigated | +| R09 | Per-flight onboard signing key compromise: a captured companion lets an attacker poison `satellite-provider` ingest | External | Low | Medium | Low | Per-flight ephemeral keys deleted on flight-ring rollover (≥30 days post-landing default); parent-suite voting layer (D-PROJ-2 design task #2) requires multi-companion agreement before promoting `pending → trusted`; operator can revoke a compromised `companion_id` | Parent suite + operator | Mitigated (depends on D-PROJ-2 #2) | +| R10 | C5 covariance recovery (`Marginals.marginalCovariance`) exceeds AC-4.1 latency budget under thermal throttle | Technical | Medium | Medium | Medium | D-CROSS-LATENCY-1 hybrid: `ThermalState` from C7 triggers C4 to switch from MARGINALS → JACOBIAN per-frame; ~5–10% accuracy loss accepted under throttle, never on the steady-state path (ADR-006). 
Once thermal returns below threshold, switch back next frame | Onboard team | Mitigated | +| R11 | AC-NEW-4 / AC-NEW-7 multi-flight statistical headroom is reduced-confidence with the current single-flight (Derkachi) fixture (D-PROJ-3 deferred) | Schedule / Data | High | Medium | High | Validation requirement relaxed from "≥100 flights" literal to Monte-Carlo-with-stated-CI over current corpus; multi-flight statistical residual risk recorded in this register; D-PROJ-3 carryforward to next cycle (Maxar Open Data Ukraine + AerialVL S03 + own multi-flight data) | Project lead | Open (carryforward) | +| R12 | Single deployment camera (`adti20`, one unit available) becomes a single-point-of-failure for Tier-2 NFTs | Resource | Medium | Medium | Medium | D-PROJ-1 hybrid calibration acquired per-unit; bench Jetson at HQ retains a copy of every successfully-built engine cache (D-C10-8); IT-12 comparative study uses static fixtures so it is camera-unit-agnostic. If the deployed unit fails, the cache rebuilds on a replacement unit | Onboard team | Accepted | +| R13 | C13 FDR queue overrun on a sustained burst (e.g., F4 mid-flight tile gen + per-frame estimates + IMU traces) | Technical | Low | Medium | Low | Per-producer drop-oldest queues; rollover event itself is always logged (`FdrQueueOverrunError` writes a structured "overrun" record with producer-id and dropped count); NFT-LIM-02 (8 h synthetic AC-NEW-3) validates aggregate throughput | Onboard team | Mitigated | +| R14 | Apparent C2.5 ↔ C3 build-time circular dependency around the LightGlue runtime | Technical | Low | Low | Low | Resolved: ownership of `LightGlueRuntime` moved to the shared helper (`_docs/02_document/common-helpers/03_helper_lightglue_runtime.md`); both C2.5 and C3 are sibling consumers; data flow remains C2.5 → C3 (one-way). 
C2.5 spec § 8 updated to reference the helper, not C3 | Onboard team | Mitigated (this iteration) | + +## Risk Categories + +### Technical Risks (R02, R03, R04, R05, R06, R07, R10, R13, R14) + +Concentrated in three areas: (a) the GPU/TRT stack and its hardware-tied engine cache; (b) the GTSAM substrate shared between C4 and C5; (c) MAVLink signing handshake. Each has a concrete mitigation already wired into the architecture. + +### Schedule / Data Risks (R11) + +Single risk: D-PROJ-3 multi-flight fixture acquisition deferred. Validation strategy adjusted from literal-count to Monte-Carlo-with-CI. Carryforward to next cycle. + +### Resource Risks (R12) + +Single deployment camera. Accepted with bench Jetson + HQ engine archive as the contingency. + +### External Risks (R01, R08, R09) + +All three depend on parent-suite work — D-PROJ-2 (#1 ingest endpoint, #2 voting layer) plus the existing `satellite-provider` GET surface. Cross-workspace dependency tracked in `_docs/_process_leftovers/2026-05-09_satellite-provider-design-tasks.md`. + +## Detailed Risk Analysis + +### R02: ADR-004 process-isolation invariant broken by accidental airborne C11 link + +**Description**: The architecture's primary defence against in-air outbound writes to `satellite-provider` is **process-level isolation** — the C11 Tile Manager (both `TileDownloader` and `TileUploader`) is excluded from the airborne CMake target so the airborne image cannot load that code path even via reflection or config error. A regression that adds a `c11_tilemanager/` import path to a shared module the airborne binary depends on would silently re-introduce the upload code into airborne memory, defeating ADR-004. + +**Trigger conditions**: +- A future refactor moves a helper used by C11 into a shared module. +- A new feature accidentally adds `c11_tilemanager` to the airborne CMake target list. +- A reflection-based plugin loader scans all `src/components/` and instantiates whatever it finds. 
+ +**Affected components**: C11 (the deletion target), the airborne `production-binary` build, the runtime composition root. + +**Mitigation strategy**: +1. CI security gate: SBOM diff fails if any `c11_tilemanager/` symbol appears in the airborne `production-binary` artifact (extension of ADR-002's existing per-implementation enforcement). +2. Runtime self-check in `runtime_root.py`: if any `c11_tilemanager.*` module is importable from inside the airborne process, panic at startup before opening the FC adapter. +3. Test gate (NFT-SEC-02): explicit "egress test" verifies the airborne process cannot reach `satellite-provider` over the network. + +**Contingency plan**: If the runtime self-check fires in flight, the airborne process refuses to publish `GPS_INPUT` / `MSP2_SENSOR_GPS` and alerts via STATUSTEXT — defaulting back to FC IMU-only (AC-5.2 fallback path). + +**Residual risk after mitigation**: Low. + +**Documents updated**: `architecture.md` ADR-004 (extended to cover both `TileDownloader` and `TileUploader`); `architecture.md` ADR-002 (SBOM diff scope); `tests/security-tests.md` NFT-SEC-02. + +--- + +### R03: MAVLink 2.0 per-flight signing handshake (D-C8-9 = (d)) has no production precedent + +**Description**: D-C8-9 selects MAVLink 2.0 message signing (option d) on the AP wired channel with per-flight ephemeral keys. The AP firmware supports it but no flight-deployed system the project has reviewed runs it in a per-flight rotation pattern; the contract for the handshake at takeoff is project-defined. + +**Trigger conditions**: ArduPilot Plane firmware behaviour differs from documentation; the handshake fails on a real airframe. + +**Affected components**: C8 (per-FC adapter), F2 (takeoff load). + +**Mitigation strategy**: +1. **IT-3 ArduPilot SITL validation gate** — IT-3 must pass before D-C8-9 is locked. The test exercises full handshake + signed-channel + key rotation in SITL. +2. 
**D-C8-2-FALLBACK options recorded** (per ADR-008): (a) operator-manual RC aux switch with relaxed AC-NEW-2 wording; (b) operator-warning STATUSTEXT instead of automated switch; (c) escalate to ArduPilot dev community. + +**Contingency plan**: If IT-3 fails, fall back to (a) per ADR-008 and accept the relaxed AC-NEW-2 wording for this cycle. This is a documented planned fallback, not an emergency response. + +**Residual risk after mitigation**: Medium-Low until IT-3 passes; Low once IT-3 is green. + +**Documents updated**: `architecture.md` ADR-008; `components/10_c8_fc_adapter/description.md` § 5 (Error Handling); `tests/integration-tests.md` IT-3. + +--- + +### R11: AC-NEW-4 / AC-NEW-7 reduced-confidence statistical headroom + +**Description**: AC-NEW-4 (per-flight pose-error CDF) and AC-NEW-7 (cross-flight cache-poisoning safety budget) were originally framed as ≥100-flight statistical claims. With current data limited to a single Derkachi flight + 60 still images, the project cannot meet the literal bound and instead validates with Monte-Carlo over the current corpus, with stated CIs. + +**Trigger conditions**: Reviewer or operator interprets the original AC text strictly; the relaxed Monte-Carlo wording is not formally accepted upstream. + +**Affected components**: validation strategy across NFT-PERF-01, NFT-LIM-03, NFT-SEC-01. + +**Mitigation strategy**: +1. AC text already revised on 2026-05-09 to "Monte-Carlo over currently-available data corpus with stated CI" (per `acceptance_criteria.md` revision note). +2. D-PROJ-3 (multi-flight fixture acquisition) is the carryforward: AerialVL S03 + Maxar Open Data Ukraine + own multi-flight data when next cycle resources permit. +3. NFT specs explicitly mark this as PARTIAL coverage with traceability flag in the matrix. + +**Contingency plan**: If the relaxed wording is rejected, the project escalates D-PROJ-3 to a blocking pre-cycle dep and pauses validation NFTs until multi-flight data is acquired.
+ +**Residual risk after mitigation**: Medium (acknowledged headroom; acceptable for this cycle but tracked). + +**Documents updated**: `00_problem/acceptance_criteria.md` revision note; `tests/traceability-matrix.md`; `01_solution/solution.md` Plan-phase carryforward section. + +## Architecture / Component Changes Applied (this iteration) + +| Risk ID | Document Modified | Change Description | +|---------|------------------|--------------------| +| R02 | `architecture.md` ADR-004 | Extended scope: process isolation now excludes both C11 download and upload code paths from the airborne image; runtime self-check requirement noted | +| R02 | `architecture.md` ADR-002 | SBOM diff already covers per-implementation linking; explicit C11 boundary added to enforcement scope | +| R14 | `components/03_c2_5_rerank/description.md` § 1 + § 8 | LightGlueRuntime ownership moved from C3 to the shared helper; C2.5's "Must be implemented after" no longer references C3 directly | + +(R03, R04, R05, R06, R07, R08, R10, R13 already had mitigations wired into the architecture/component specs in earlier iterations of Plan; this register only cross-references them.) + +## Summary + +**Total risks identified**: 14 +**Critical**: 0 | **High**: 4 (R01, R03, R04, R11) | **Medium**: 7 (R02, R05, R06, R07, R08, R10, R12) | **Low**: 3 (R09, R13, R14) +**Risks mitigated this iteration**: 2 net-new (R02 enforcement scope extended; R14 resolved via helper ownership) +**Risks requiring user decision**: none in this iteration. R03 is gated by IT-3 (a future test event); R11 is the documented D-PROJ-3 carryforward already accepted earlier in the cycle.
diff --git a/_docs/02_document/system-flows.md b/_docs/02_document/system-flows.md index 5797843..cd14ba7 100644 --- a/_docs/02_document/system-flows.md +++ b/_docs/02_document/system-flows.md @@ -9,16 +9,16 @@ | # | Flow Name | Trigger | Primary Components | Criticality | |---|-----------|---------|--------------------|-------------| -| F1 | Pre-flight cache provisioning | Operator runs C12 cache-build CLI on workstation | C12 (operator), [[`satellite-provider`]], C10, C6, C7 | High | +| F1 | Pre-flight cache provisioning | Operator runs C12 cache-build CLI on workstation | C12 (operator), C11 `TileDownloader`, [[`satellite-provider`]], C10, C6, C7 | High | | F2 | Takeoff load | Companion boot detected by FC `MAV_STATE` ARMED OR companion process start with armed FC | C10, C7, C8 (signing handshake), C13 | High | | F3 | Steady-state per-frame estimation | Nav camera frame received (3 Hz nominal) | C1, C2, C2.5, C3, C3.5, C4, C5, C8 (out), C13 | High | -| F4 | Mid-flight tile generation + local cache write | Successful satellite-anchored frame with quality metadata above threshold | C5, C6, C13 (no C8/C11 path) | High | +| F4 | Mid-flight tile generation + local cache write | Successful satellite-anchored frame with quality metadata above threshold | C5, C6, C13 (no C8/C11 path — C11 `TileUploader` is not loaded in the airborne image) | High | | F5 | Visual blackout + spoofed-GPS failsafe | Camera unusable AND/OR FC GPS reports denial/spoof | C1, C5, C8, C13 (degraded-mode escalation per AC-NEW-8) | High | | F6 | Sharp-turn / disconnected-segment re-localization | Frame-to-frame registration fails for ≥ 1 frame (AC-3.2 / AC-3.3) | C1, C2, C2.5, C3, C3.5, C4, C5, C8, C13; optionally operator (AC-3.4) | High | | F7 | Spoofing-promotion via EKF source-set switch | FC reports GPS denial/spoof while companion estimate is healthy | C5, C8, [[ArduPilot Plane FC]] | High | | F8 | Companion reboot recovery | Companion process restart while FC remains armed | C8 (FC IMU pose 
ingest), C5, C10 (warm-cache verify), C13 | Medium | | F9 | GCS telemetry stream | Per-frame estimate available + GCS link healthy | C5, C8, [[QGroundControl]] | Medium | -| F10 | Post-landing tile upload | Operator triggers C11 with `flight_state == ON_GROUND` confirmed | C11 (operator-side), C6 (read), [[`satellite-provider`]] (D-PROJ-2 endpoint, planned) | High | +| F10 | Post-landing tile upload | Operator triggers C11 `TileUploader` with `flight_state == ON_GROUND` confirmed | C11 `TileUploader` (operator-side), C6 (read), [[`satellite-provider`]] (D-PROJ-2 endpoint, planned) | High | ## Flow Dependencies @@ -43,7 +43,12 @@ ### Description -The operator builds (or refreshes) the per-mission cache on the companion before takeoff: downloads tiles from `satellite-provider` for the operational area, generates VPR descriptors, compiles TensorRT engines, applies sector-classified freshness rules, and writes a manifest with a SHA-256 content-hash gate. This flow is offline and not time-critical; it is the only path that reaches `satellite-provider` from the companion side. +The operator builds (or refreshes) the per-mission cache before takeoff. F1 has **two phases** sequenced by C12 OperatorTool: + +- **Phase 1 — Tile download (C11 `TileDownloader`)**: fetch tiles from `satellite-provider` for the operational area; apply sector-classified freshness rules (AC-NEW-6) and resolution gate (RESTRICT-SAT-4); write tile rows + JPEGs into C6. +- **Phase 2 — Cache artifact build (C10 CacheProvisioner)**: read the populated C6 store; compile/deserialize TRT engines via C7; batch-generate descriptors via the C2 backbone; atomically write the FAISS HNSW index with SHA-256 sidecars; write the Manifest hashing model + calibration + corpus + sector classification. + +This flow is offline and not time-critical. **Only Phase 1 reaches `satellite-provider`** — and it runs on the operator workstation, which is the only host that holds the TLS + service-internal API key. 
The companion never reaches `satellite-provider` directly. ### Preconditions @@ -59,25 +64,28 @@ The operator builds (or refreshes) the per-mission cache on the companion before sequenceDiagram participant Operator participant C12OperatorTool as C12 Operator Tool (workstation) + participant C11TileDownloader as C11 TileDownloader (workstation) participant SatelliteProvider as [[satellite-provider]] (.NET 8) + participant C6TileStore as C6 TileStore + DescriptorIndex (Postgres + filesystem + FAISS) participant C10Provisioner as C10 CacheProvisioner (companion) participant C7Inference as C7 InferenceRuntime participant C2Backbone as C2 VPR backbone (TensorRT) - participant C6TileStore as C6 TileStore + DescriptorIndex (Postgres + filesystem + FAISS) Operator->>C12OperatorTool: build_cache(area, sector_class, calibration_file) - C12OperatorTool->>SatelliteProvider: GET /api/satellite/tiles?bbox=&zoom= - SatelliteProvider-->>C12OperatorTool: Tile blobs + metadata (paged) - C12OperatorTool->>C12OperatorTool: filter by AC-NEW-6 freshness (sector-class-driven) - C12OperatorTool->>C10Provisioner: stage(tiles, manifest_inputs, calibration) - C10Provisioner->>C6TileStore: write tiles to ./tiles/{zoomLevel}/{x}/{y}.jpg + Postgres rows + C12OperatorTool->>C11TileDownloader: download_tiles_for_area(bbox, zooms, sector_class) + C11TileDownloader->>SatelliteProvider: GET /api/satellite/tiles?bbox=&zoom= + SatelliteProvider-->>C11TileDownloader: Tile blobs + metadata (paged) + C11TileDownloader->>C11TileDownloader: filter by AC-NEW-6 freshness + RESTRICT-SAT-4 resolution + C11TileDownloader->>C6TileStore: write tiles to ./tiles/{zoomLevel}/{x}/{y}.jpg + Postgres rows (source='googlemaps') + C11TileDownloader-->>C12OperatorTool: DownloadBatchReport (counts, freshness summary) + C12OperatorTool->>C10Provisioner: build_cache_artifacts(bbox, zooms, sector_class, calibration) C10Provisioner->>C7Inference: load VPR backbone ONNX C7Inference-->>C10Provisioner: TRT engine compiled (cached 
per SM/JP/TRT/precision tuple) - C10Provisioner->>C2Backbone: per-tile descriptor generation (batched on Jetson) + C10Provisioner->>C2Backbone: per-tile descriptor generation (batched on Jetson, reads tiles from C6) C2Backbone-->>C10Provisioner: descriptor matrix (FP16/INT8 per D-C7-1) C10Provisioner->>C6TileStore: faiss.write_index (HNSW) + atomicwrites + SHA-256 content-hash C10Provisioner->>C10Provisioner: write Manifest (hash of model + calibration + corpus + sector_class) - C10Provisioner-->>C12OperatorTool: provisioning report (counts, hashes, freshness summary) + C10Provisioner-->>C12OperatorTool: BuildReport (counts, hashes) C12OperatorTool-->>Operator: PASS / FAIL summary ``` @@ -86,15 +94,18 @@ sequenceDiagram ```mermaid flowchart TD Start([Operator invokes C12 build]) --> Classify[Operator classifies sector active_conflict OR stable_rear] - Classify --> Download[C12 downloads tiles from satellite-provider for bbox + zoom] + Classify --> InvokeC11[C12 invokes C11 TileDownloader] + InvokeC11 --> Download[C11 GET /api/satellite/tiles for bbox + zoom] Download --> FreshnessFilter{Freshness ok per AC-8.2 + AC-NEW-6?} FreshnessFilter -->|stale and stable_rear| RejectOrDowngrade[Reject or downgrade tile] FreshnessFilter -->|stale and active_conflict| RejectOrDowngrade - FreshnessFilter -->|fresh| Stage[C12 stages tiles + calibration on companion] - RejectOrDowngrade --> Stage - Stage --> InvokeC10[C12 invokes C10 CacheProvisioner on companion] - InvokeC10 --> WriteTiles[C10 writes tiles to filesystem + Postgres] - WriteTiles --> CompileEngines[C10 compiles TRT engines via C7 InferenceRuntime] + FreshnessFilter -->|fresh| ResolutionGate{Resolution >= 0.5 m/px per RESTRICT-SAT-4?} + RejectOrDowngrade --> ResolutionGate + ResolutionGate -->|fail| RejectRes[Reject and report] + ResolutionGate -->|pass| WriteTiles[C11 writes tiles to filesystem + Postgres] + WriteTiles --> InvokeC10[C12 invokes C10 build_cache_artifacts] + RejectRes --> Done + InvokeC10 --> 
CompileEngines[C10 compiles or reuses TRT engines via C7 InferenceRuntime] CompileEngines --> EngineCacheHit{EngineCacheEntry already valid for SM JP TRT precision tuple?} EngineCacheHit -->|yes D-C10-6| ReuseEngine[Reuse cached engine and INT8 calibration cache] EngineCacheHit -->|no| BuildEngine[Polygraphy or trtexec or IBuilderConfig hybrid build] @@ -113,26 +124,28 @@ flowchart TD | Step | From | To | Data | Format | |------|------|----|------|--------| | 1 | Operator | C12 | (`bounding_box`, `zoom_levels`, `sector_class`, `calibration_path`) | CLI args / GUI form | -| 2 | C12 | `satellite-provider` REST | `GET /api/satellite/tiles?bbox=…&zoom=…` | HTTPS query | -| 3 | `satellite-provider` | C12 | Paged tile blobs + metadata rows | JPEG + JSON metadata | -| 4 | C12 | C10 (over USB/Eth) | Staged tile bundle + calibration JSON | Tarball + manifest stub | -| 5 | C10 | C6 filesystem | Tile JPEG bodies | `./tiles/{zoomLevel}/{x}/{y}.jpg` | -| 6 | C10 | C6 PostgreSQL | Tile metadata rows | SQL INSERT (mirror of `satellite-provider`'s `tiles` table) | -| 7 | C10 → C7 | TRT engine cache | TRT engines | `.engine` files keyed by `(SM, JP, TRT, precision)` (D-C10-7) | -| 8 | C2 backbone | C6 FAISS index | Descriptor matrix | `.index` (FAISS HNSW), atomicwrites, SHA-256 sidecar | -| 9 | C10 | filesystem | Manifest | YAML or JSON; carries hashes | +| 2 | C12 | C11 `TileDownloader` | `DownloadRequest` | in-process call | +| 3 | C11 | `satellite-provider` REST | `GET /api/satellite/tiles?bbox=…&zoom=…` | HTTPS query | +| 4 | `satellite-provider` | C11 | Paged tile blobs + metadata rows | JPEG + JSON metadata | +| 5 | C11 | C6 filesystem (over USB/Eth) | Tile JPEG bodies | `./tiles/{zoomLevel}/{x}/{y}.jpg` | +| 6 | C11 | C6 PostgreSQL | Tile metadata rows (`source='googlemaps'`) | SQL INSERT (mirror of `satellite-provider`'s `tiles` table) | +| 7 | C12 | C10 `CacheProvisioner` | `BuildRequest` | in-process call (operator-tool side); RPC over USB/Eth to companion runner | +| 8 
| C10 → C7 | TRT engine cache | TRT engines | `.engine` files keyed by `(SM, JP, TRT, precision)` (D-C10-7) | +| 9 | C2 backbone (driven by C10) | C6 FAISS index | Descriptor matrix | `.index` (FAISS HNSW), atomicwrites, SHA-256 sidecar | +| 10 | C10 | filesystem | Manifest | YAML or JSON; carries hashes | ### Error scenarios | Error | Where | Detection | Recovery | |-------|-------|-----------|----------| -| `satellite-provider` unreachable | Step 2 | HTTP timeout / 5xx | C12 fails with explicit error; operator retries when network is available; takeoff blocked | -| Tile fails freshness | Step 4 | `tile.capture_timestamp` vs `sector_class` threshold | Reject (active_conflict) or downgrade-no-`satellite_anchored`-label (rear), per AC-NEW-6; report to operator | -| Resolution below 0.5 m/px | Step 4 | Tile metadata GSD check (RESTRICT-SAT-4) | Reject; report; takeoff blocked | -| Insufficient cache budget | Step 5 | Filesystem free-space check pre-write | Fail fast with explicit budget delta; no partial write | -| Engine compile failure | Step 7 | Polygraphy / trtexec exit code; no output `.engine` | Surface error to operator; takeoff blocked; **never silently fall back** | -| Descriptor generation OOM on Jetson | Step 8 | CUDA OOM | Halve batch size and retry once; if still OOM, surface to operator | -| Atomic-write or SHA-256 mismatch | Step 8 | `atomicwrites` rollback or content-hash sidecar mismatch | Mark cache invalid; rebuild from staged tiles; if persistent, surface to operator | +| `satellite-provider` unreachable | Step 3 | HTTP timeout / 5xx | C11 `TileDownloader` fails with explicit error; operator retries when network is available; takeoff blocked | +| Tile fails freshness | Step 4 (C11) | `tile.capture_timestamp` vs `sector_class` threshold | Reject (active_conflict) or downgrade-no-`satellite_anchored`-label (rear), per AC-NEW-6; counts surface in `DownloadBatchReport` | +| Resolution below 0.5 m/px | Step 4 (C11) | Tile metadata GSD check 
(RESTRICT-SAT-4) | Reject; report; takeoff blocked | +| Insufficient cache budget | Step 5 (C11) | Filesystem free-space check pre-write | Fail fast with explicit budget delta; no partial write | +| C6 missing tiles for requested bbox/zoom | Step 7 (C10) | C10's pre-build scan finds < expected tile count | Surface as `BuildReport.failure` instructing operator to re-run C11 `TileDownloader`; do **not** trigger network fetch from C10 | +| Engine compile failure | Step 8 | Polygraphy / trtexec exit code; no output `.engine` | Surface error to operator; takeoff blocked; **never silently fall back** | +| Descriptor generation OOM on Jetson | Step 9 | CUDA OOM | Halve batch size and retry once; if still OOM, surface to operator | +| Atomic-write or SHA-256 mismatch | Step 9 | `atomicwrites` rollback or content-hash sidecar mismatch | Mark cache invalid; rebuild from staged tiles; if persistent, surface to operator | | Tampered cache (post-write, pre-takeoff) | (caught at takeoff in F2, not here) | F2 SHA-256 content-hash gate | F2 refuses takeoff (IT-7) | ### Performance expectations @@ -390,12 +403,12 @@ GTSAM iSAM2 with `IncrementalFixedLagSmoother` retroactively refines past keyfra ### Description -For every successful satellite-anchored frame whose `TileQualityMetadata` clears the publish threshold, orthorectify the nav frame onto basemap projection, deduplicate against the existing local tile cache, and write the result locally in `satellite-provider`-compatible on-disk format. **No outbound write while airborne** — process-level isolation enforces this: the C11 upload path is not loaded in the airborne companion image (ADR-004). The post-landing tool (F10) is a separate process / image. 
+For every successful satellite-anchored frame whose `TileQualityMetadata` clears the publish threshold, orthorectify the nav frame onto basemap projection, deduplicate against the existing local tile cache, and write the result locally in `satellite-provider`-compatible on-disk format. **No outbound network write while airborne** — process-level isolation enforces this: neither the C11 `TileUploader` nor the C11 `TileDownloader` is loaded in the airborne companion image (ADR-004). The post-landing upload (F10) runs in the C11 Tile Manager's `TileUploader`, which is a separate process / image. ### Preconditions - F3 produced a `PoseEstimate` with provenance `satellite_anchored` and covariance below the publish threshold. -- `flight_state == IN_AIR` is signalled by FC `MAV_STATE`; the in-air image does not contain C11 (process-level isolation). +- `flight_state == IN_AIR` is signalled by FC `MAV_STATE`; the in-air image does not contain C11 (process-level isolation; both `TileDownloader` and `TileUploader` paths absent). - Local C6 tile store has free quota (per AC-NEW-3 FDR sub-budget allocation). - Mid-flight tile metadata schema (quality_metadata) is configured per AC-NEW-7 + D-PROJ-2 contract sketch.
@@ -463,7 +476,7 @@ flowchart TD | Filesystem write fails | Step 3 | filesystem error | Skip tile; FDR logs error; pipeline continues (tile generation is best-effort, not safety-critical) | | Postgres insert fails | Step 4 | DB error | Skip tile; FDR logs error | | Local cache quota exhausted | Step 3 | pre-write free-space check | LRU-evict oldest **mid-flight** tile (never evict pre-flight `satellite-provider` tiles); FDR logs eviction | -| `flight_state` glitch reports `ON_GROUND` mid-flight | architectural | software guard — but C11 is not loaded anyway | Defense-in-depth holds: even if guard misfires, C11 binary is not present in the airborne image | +| `flight_state` glitch reports `ON_GROUND` mid-flight | architectural | software guard — but C11 is not loaded anyway | Defense-in-depth holds: even if guard misfires, C11 (both `TileDownloader` and `TileUploader`) is not present in the airborne image | | Dedup race (two threads writing same cell) | Step 4 | DB unique constraint or filesystem `O_EXCL` | Retry once with the freshest candidate; FDR logs race | ### Performance expectations @@ -902,26 +915,26 @@ sequenceDiagram ### Description -After the UAV has landed and `flight_state == ON_GROUND` is confirmed, the operator triggers C11 (a separate operator-side process / image — **not present in the airborne companion image**, ADR-004) which reads locally-saved mid-flight tiles from C6 and uploads them to `satellite-provider`'s ingest endpoint per the D-PROJ-2 contract sketch. Each tile carries quality metadata sufficient for the parent-suite voting layer to decide promotion `pending → trusted` (D-PROJ-2 design task #2; not yet implemented service-side — `mock-suite-sat-service` stands in for testing). 
+After the UAV has landed and `flight_state == ON_GROUND` is confirmed, the operator triggers the C11 Tile Manager's `TileUploader` (a separate operator-side process / image — **not present in the airborne companion image**, ADR-004) which reads locally-saved mid-flight tiles from C6 and uploads them to `satellite-provider`'s ingest endpoint per the D-PROJ-2 contract sketch. Each tile carries quality metadata sufficient for the parent-suite voting layer to decide promotion `pending → trusted` (D-PROJ-2 design task #2; not yet implemented service-side). Until the real endpoint ships, integration tests target the e2e-test `mock-suite-sat-service` fixture under `tests/fixtures/`; production never reaches the fixture. ### Preconditions - `flight_state == ON_GROUND` confirmed by the FC's `MAV_STATE` (operator's workstation reads this off the FC or from the FDR). -- Operator workstation has network reach to `satellite-provider` (or `mock-suite-sat-service` in test). +- Operator workstation has network reach to `satellite-provider` (in tests, the e2e `mock-suite-sat-service` fixture stands in for the not-yet-shipped POST endpoint). - Local C6 tile store has mid-flight tiles with `voting_status=pending` and quality metadata. -- Per-flight onboard signing key (generated at takeoff load, baked into tile metadata) is available to C11 for payload signing. +- Per-flight onboard signing key (generated at takeoff load, baked into tile metadata) is available to C11 `TileUploader` for payload signing. 
### Sequence Diagram ```mermaid sequenceDiagram participant Operator - participant C11 as C11 PostLandingUploadTool (workstation) + participant C11 as C11 TileUploader (workstation) participant C6 as C6 TileStore (companion or workstation mirror) participant SatelliteProvider as [[satellite-provider]] (D-PROJ-2 endpoint, planned) participant Fdr as C13 FdrWriter - Operator->>C11: trigger upload(flight_id) + Operator->>C11: upload_pending_tiles(flight_id) C11->>C6: read mid-flight tiles where voting_status=pending AND flight_id=... C6-->>C11: batched Tile + TileQualityMetadata loop per batch @@ -930,7 +943,7 @@ sequenceDiagram C11->>C6: update voting_status=uploaded for accepted tiles C11->>Fdr: upload-batch event + service response end - C11-->>Operator: upload report (counts, rejections, duplicates) + C11-->>Operator: UploadBatchReport (counts, rejections, duplicates) Note over SatelliteProvider: voting layer (D-PROJ-2 design task #2) eventually promotes pending → trusted; out of scope for this flow ``` @@ -938,7 +951,7 @@ sequenceDiagram ```mermaid flowchart TD - Start([Operator triggers C11 with flight_id]) --> StateCheck{flight_state == ON_GROUND confirmed?} + Start([Operator triggers C11 TileUploader with flight_id]) --> StateCheck{flight_state == ON_GROUND confirmed?} StateCheck -->|no| Refuse[Refuse to upload; report to operator] StateCheck -->|yes| ReadTiles[Read mid-flight tiles voting_status=pending] ReadTiles --> Empty{Any tiles to upload?} diff --git a/_docs/02_document/tests/traceability-matrix.md b/_docs/02_document/tests/traceability-matrix.md index 1871662..a37ae34 100644 --- a/_docs/02_document/tests/traceability-matrix.md +++ b/_docs/02_document/tests/traceability-matrix.md @@ -103,7 +103,7 @@ Coverage clears the 75% gate with margin under both the inclusive reading (PARTI These insights from Phase 2 augment the F1-F5 carried over from Phase 1; together they feed forward into Solution Analysis (Step 2) and Component Decomposition (Step 3): 1. 
**F6 — Two-tier execution profile is a first-class architectural concern.** The split between Tier-1 (workstation Docker) and Tier-2 (Jetson hardware) means several AC have validation locations that must appear in the deployment plan and in the CI matrix design. Add a "Tier-2 hardware-runner availability" entry to the project's risk register (Step 4). -2. **F7 — `mock-suite-sat-service` is a real testing-time dependency that must be documented as a component boundary (not just a test fixture).** It encodes the publish-side of D-PROJ-2 and feeds into both NFT-SEC-01 and FT-P-17. Component decomposition (Step 3) should treat the Service-publish contract as an explicit C8/C10 cross-cutting boundary, not buried inside C8. +2. **F7 — `mock-suite-sat-service` is an e2e-test fixture for the not-yet-shipped D-PROJ-2 POST contract.** It is **not a first-class component** (ADR-007 reversed 2026-05-09); the architectural counterparty for both download and upload is the real `satellite-provider`. The contract sketch is the source of truth and lives in `_docs/_process_leftovers/2026-05-09_satellite-provider-design-tasks.md`; the fixture mirrors it for NFT-SEC-01 / FT-P-17 / IT runs and is retired when the real endpoint ships. Component decomposition (Step 3) treats the Service-publish contract as a C11 `TileUploader` ↔ `satellite-provider` boundary (not buried inside C8). 3. **F8 — VioStrategy parameterization in CI requires both a production binary AND a research binary.** D-C1-1-SUB-A locked the BUILD_VINS_MONO=ON/OFF split; the test plan must produce both binaries on every PR for the comparative-study report (IT-12 in `solution.md`). Add to deployment plan (Step 2) and to epic/work-item planning (Step 6). 4. **F9 — D-PROJ-3 (fixture acquisition) is now a named deliverable** with a clear gate: must resolve before greenfield Step 5 re-runs the full test-spec with architecture context. Promote to risk register and to the architecture's open-items list. 5. 
**F10 — Defense-in-depth security layer (NFT-SEC-05 DNS blackholing, OPENCV ASan build, SBOM signing-passkey verification)** implies CI/build infrastructure features (multi-stage build for ASan instrumentation, SBOM generator, lockfile linter). Add to deployment plan (Step 2). diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 6670051..7c22186 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -6,8 +6,8 @@ step: 3 name: Plan status: in_progress sub_step: - phase: 10 - name: plan-step2-phase2a-blocking-review - detail: "ADR-002 reframed to technical-only build-time exclusion; license-driven framing removed; packaging optionality NOTE added" + phase: 21 + name: plan-step6-epics-local-draft + detail: "Step 5 complete; Step 6 epics.md drafted locally (18 epics, 2054 lines, chunked write per new large-file-writes rule); awaiting user decision on Jira push" retry_count: 0 cycle: 1