From e077d3bd156a816a939efcf6269e385dce02947e Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Wed, 20 May 2026 22:11:16 +0300 Subject: [PATCH] [AZ-662] [AZ-669] Close batch 19: green test gate via Jetson Docker Stand up a production-target test runner on jetson-e2e and run the deferred cargo test --workspace for batch 19. Infra: - Dockerfile.test: ubuntu:22.04 + libopencv-dev + libav*-dev + libclang-dev + protobuf-compiler + rust 1.82.0 (rustfmt, clippy). Sets LIBCLANG_PATH so clang-sys can dlopen libclang under the opencv-rust clang-runtime path. - scripts/jetson-test.sh: rsync source to jetson-e2e, docker build, docker run cargo test --workspace --no-fail-fast. Workspace fix exposed by the gate: - Cargo.toml: enable opencv "clang-runtime" feature. Without it the workspace fails to build because clang-sys is shared between opencv-binding-generator and bindgen (via ffmpeg-sys-next) and the opencv generator panics with "a `libclang` shared library is not loaded on this thread" (opencv-rust GH issue #635). Batch-19 code bugs exposed by the gate (6 compile errors + 1 algo bug): - movement_detector::optical_flow: min_max_loc signature (opencv 0.98 expects Option<&mut f64> / Option<&mut Point>); data_mut() returns *mut u8 directly, not Result. RANSAC residual now filters by the inlier mask returned by find_homography (matches the docstring; was systematically over-reporting motion magnitude on synthetic pure-pan input). - semantic_analyzer::scoring::freshness: same data_mut() fix; stddev_f32 now takes &impl core::ToInputArray so it accepts the BoxedRef that Mat::roi returns in opencv 0.98. Result: 391 tests passed across 58 binaries, 0 in-scope failures. 
Two pre-existing failures in frame_ingest (batch 16-18 scope) are NOT addressed here and are recorded as leftovers: - frame_ingest_cuvid_segv: HIGH severity production bug; libavcodec58 advertises h264_cuvid but libnvcuvid.so.1 is missing at runtime, the software fallback never fires, first send_packet SEGVs. - frame_ingest_publisher_timing_flake: LOW severity; Jetson-specific timing budget too tight for ac1_three_consumers_at_rate_lose_no_frames. Neither blocks batch 20 (movement_detector / semantic_analyzer next). Co-authored-by: Cursor --- Cargo.toml | 23 +++++- Dockerfile.test | 80 +++++++++++++++++++ .../batch_19_cycle1_report.md | 56 ++++++++++--- _docs/_autodev_state.md | 15 ++-- .../2026-05-20_batch19_opencv_test_gate.md | 49 ------------ .../2026-05-20_frame_ingest_cuvid_segv.md | 65 +++++++++++++++ ...-20_frame_ingest_publisher_timing_flake.md | 38 +++++++++ .../src/internal/optical_flow/mod.rs | 32 ++++++-- .../src/internal/scoring/freshness.rs | 9 ++- scripts/jetson-test.sh | 73 +++++++++++++++++ 10 files changed, 359 insertions(+), 81 deletions(-) create mode 100644 Dockerfile.test delete mode 100644 _docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md create mode 100644 _docs/_process_leftovers/2026-05-20_frame_ingest_cuvid_segv.md create mode 100644 _docs/_process_leftovers/2026-05-20_frame_ingest_publisher_timing_flake.md create mode 100755 scripts/jetson-test.sh diff --git a/Cargo.toml b/Cargo.toml index af90118..399417c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,15 +87,30 @@ libc = "0.2" # Geospatial h3o = "0.7" -# Computer vision (movement_detector ego-motion + semantic_analyzer freshness scoring) -opencv = { version = "0.98", default-features = false, features = ["calib3d", "imgproc", "video"] } +# Computer vision (movement_detector ego-motion + semantic_analyzer freshness scoring). 
+# `clang-runtime` is required because the workspace ALSO uses `bindgen` +# (via `ffmpeg-sys-next`), and the opencv generator's static libclang +# linkage conflicts with bindgen's clang-sys instance — symptom: +# "a `libclang` shared library is not loaded on this thread" at build +# time. See opencv-rust GH issue #635. The runtime feature switches +# opencv-binding-generator to dlopen libclang via `LIBCLANG_PATH`, +# resolving the conflict. +opencv = { version = "0.98", default-features = false, features = ["calib3d", "imgproc", "video", "clang-runtime"] } # Graph data structures (semantic_analyzer primitive graph) petgraph = "0.8" # Multimedia (RTSP + H.264/265 decode for frame_ingest — see AZ-658). -# Linked dynamically against the host FFmpeg 8.x install (libavcodec / -# libavformat / libavutil / libswscale / libswresample) via pkg-config. +# Linked dynamically against the host FFmpeg via pkg-config. +# `ffmpeg-sys-next` performs compile-time FFmpeg version detection +# (sets `ffmpeg_4_4` / `ffmpeg_5_x` / `ffmpeg_8_x` cfg flags +# automatically — see crates.io README), so this single dep pin +# compiles against FFmpeg 3.4 through 8.x. The production Jetson +# target (JetPack 6 / Ubuntu 22.04) ships FFmpeg 4.4; the macOS +# dev box typically has 6.x or 7.x via Homebrew. Default features +# pull in: codec (libavcodec-dev), device (libavdevice-dev), filter +# (libavfilter-dev), format (libavformat-dev), software-resampling +# (libswresample-dev), software-scaling (libswscale-dev). ffmpeg-next = "8.1" # Test scaffolding diff --git a/Dockerfile.test b/Dockerfile.test new file mode 100644 index 0000000..9dc9c76 --- /dev/null +++ b/Dockerfile.test @@ -0,0 +1,80 @@ +# Test image for the autopilot workspace. +# +# Mirrors the production target (Jetson Orin Nano Super, JetPack 6, Ubuntu +# 22.04 LTS aarch64, FFmpeg 4.4, OpenCV 4.8) — see deploy/jetson/README.md. 
+# `ffmpeg-sys-next 8.1` performs compile-time FFmpeg version detection +# (sets `ffmpeg_4_4` cfg automatically), so the workspace's `ffmpeg-next +# = "8.1"` pin works against Ubuntu 22.04's FFmpeg 4.4 with no code +# change. +# +# Build (on the Jetson): +# docker build -t autopilot-test -f Dockerfile.test . +# +# Run (mount the source so `target/` is cached across runs): +# docker run --rm -v "$PWD:/workspace" -w /workspace autopilot-test +# +# Override the command for ad-hoc work: +# docker run --rm -it -v "$PWD:/workspace" -w /workspace autopilot-test \ +# cargo test --workspace --no-fail-fast --color always +# +# First build (cold apt + rustup): ~10-20 min on Jetson Orin Nano Super. +# Subsequent builds (only Cargo.toml / sources changed): seconds. + +FROM ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive + +# Production-matching system deps. Versions resolved from +# jammy / jammy-updates / jammy-security so the resulting cargo +# build/test environment is identical to what `apt install` would +# yield on a clean JetPack 6 Jetson. +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + pkg-config \ + ca-certificates \ + curl \ + git \ + libssl-dev \ + libclang-dev \ + clang \ + libopencv-dev \ + libavcodec-dev \ + libavdevice-dev \ + libavfilter-dev \ + libavformat-dev \ + libavutil-dev \ + libswscale-dev \ + libswresample-dev \ + protobuf-compiler \ + && rm -rf /var/lib/apt/lists/* + +# `clang-sys` (used by both opencv-sys and ffmpeg-sys-next via bindgen) +# looks for `libclang.so` in the default linker search path. Ubuntu's +# `libclang-14-dev` only ships the unversioned symlink under +# `/usr/lib/llvm-14/lib/`, so we point at it explicitly. Without +# this, the build panics with "a `libclang` shared library is not +# loaded on this thread". 
+ENV LIBCLANG_PATH=/usr/lib/llvm-14/lib + +# Pin to the same Rust toolchain the workspace's rust-toolchain.toml +# expects (channel = "stable", profile = "minimal", components = +# ["rustfmt", "clippy"]). We pin the patch level here to keep CI +# reproducible; the toolchain file overrides via `+stable` if the +# Jetson dev wants a moving target. +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo \ + PATH=/usr/local/cargo/bin:$PATH + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain 1.82.0 --profile minimal \ + --component rustfmt --component clippy \ + && rustup --version \ + && cargo --version \ + && rustc --version + +WORKDIR /workspace + +# Default to running the full workspace test suite. Override at +# `docker run` time when needed. +CMD ["cargo", "test", "--workspace", "--no-fail-fast", "--color", "always"] diff --git a/_docs/03_implementation/batch_19_cycle1_report.md b/_docs/03_implementation/batch_19_cycle1_report.md index 4c56d7d..e98b8f4 100644 --- a/_docs/03_implementation/batch_19_cycle1_report.md +++ b/_docs/03_implementation/batch_19_cycle1_report.md @@ -2,8 +2,10 @@ **Tasks**: AZ-662, AZ-669 **Completed**: 2026-05-20 -**Commit**: `db844db [AZ-662] [AZ-669] Implement ego-motion estimator and primitive graph` -**Status**: Code committed; code review PASS_WITH_WARNINGS; `cargo test --workspace` **NOT YET RUN** (env-blocked — see "Test Gate" below). +**Initial commit**: `db844db [AZ-662] [AZ-669] Implement ego-motion estimator and primitive graph` +**Archival commit**: `202b2cb [AZ-662] [AZ-669] Archive batch 19; defer test gate` +**Test-gate commit**: pending — closes this batch with the Jetson Docker test infra + 6 follow-up code fixes the test gate exposed +**Status**: Code committed; lightweight code review PASS_WITH_WARNINGS; `cargo test --workspace` **GREEN for batch 19 scope** (see "Test Run — DONE" section). 
2 pre-existing failures in `frame_ingest` (batch 16/17/18 code) recorded as leftovers, not blocking. --- @@ -75,18 +77,50 @@ No Critical, no High, no Security findings. --- -## Test Gate — DEFERRED +## Test Run — DONE -`cargo test --workspace` **has not been run** for this batch. +Ran via the new Jetson Docker test pipeline (`Dockerfile.test` + `scripts/jetson-test.sh`), which mirrors the production target (Jetson Orin Nano Super, JetPack 6, Ubuntu 22.04 aarch64, FFmpeg 4.4, OpenCV 4.5). -**Why**: -- macOS dev box has no native OpenCV 4 install. `cargo test` for `movement_detector` and `semantic_analyzer` won't link. -- State file's recorded plan (`ssh jetson-e2e && cargo test --workspace`) is not directly executable — `jetson-e2e` hosts the CI infra (Gitea + Woodpecker on `~/ci/docker-compose.ci.yml`) and has neither the project checkout nor `cargo` on `$PATH`. -- `brew install opencv` failed with ENOSPC: data-partition free space ≤ 1.1 GiB; opencv + dependencies need ~3-5 GiB. +**Result**: **391 tests passed across 58 test binaries**, 2 ignored (NVDEC-positive cases that explicitly require a CUDA-capable FFmpeg), 0 in-scope failures. -**Tracked as leftover**: `_docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md`. +### Infra introduced (commits in next push) -**Next-cycle requirement**: tests for AZ-662 and AZ-669 MUST pass before batch 20 can build on top of this code. Options recorded in the leftover. 
+| Artifact | Purpose | +|---|---| +| `Dockerfile.test` | ubuntu:22.04 base + `libopencv-dev` + `libav*-dev` + `libclang-dev` + protobuf-compiler + rust 1.82.0 (rustfmt, clippy) | +| `scripts/jetson-test.sh` | rsync source → Jetson, `docker build`, `docker run cargo test --workspace --no-fail-fast --color always` | + +### Workspace fix exposed by the gate + +| File | Change | Why | +|---|---|---| +| `Cargo.toml:91` | `opencv` features += `"clang-runtime"` | Without it, the workspace fails to build because the same `clang-sys 1.8.1` instance is shared with `bindgen` (via `ffmpeg-sys-next`), and the opencv binding generator panics with "a `libclang` shared library is not loaded on this thread". `clang-runtime` makes the opencv generator dlopen libclang via `LIBCLANG_PATH` rather than relying on the statically linked instance. See opencv-rust GH issue #635. | + +### Batch-19 code fixes exposed by the gate + +The test gate caught **6 real compile errors** + **1 algorithm bug** in the original `db844db` source. These are not "test infrastructure" issues; they are bugs that the deferred test gate let through. Fixed in-scope per coderule.mdc (adjacent hygiene allowed when the change is in the same files I authored for this batch): + +| # | File | Line | Bug | Fix | +|---|---|---|---|---| +| 1 | `crates/movement_detector/src/internal/optical_flow/mod.rs` | 39-46 | `min_max_loc` called with `&mut min_val, &mut max_val, &mut Point::default(), &mut Point::default()` — opencv 0.98 expects `Option<&mut f64>` etc. | Wrapped min/max in `Some(...)`; passed `None` for the unused loc args. | +| 2 | `crates/movement_detector/src/internal/optical_flow/mod.rs` | 70 | `rgb_mat.data_mut()?` — opencv 0.98 changed `data_mut()` to return `*mut u8` directly (no `Result`). | Removed the `?`. | +| 3 | `crates/movement_detector/src/internal/optical_flow/mod.rs` | 85 | Same as #2 for `mat.data_mut()?`. | Removed the `?`. 
| +| 4 | `crates/semantic_analyzer/src/internal/scoring/freshness.rs` | 56 | Same as #2 for `mat.data_mut()?`. | Removed the `?`. | +| 5 | `crates/semantic_analyzer/src/internal/scoring/freshness.rs` | 64 | Same as #2 for `rgb.data_mut()?`. | Removed the `?`. | +| 6 | `crates/semantic_analyzer/src/internal/scoring/freshness.rs` | 94, 131 | `stddev_f32(&roi)` called with `&BoxedRef<'_, Mat>` (opencv 0.98 changed `Mat::roi` to return `BoxedRef` instead of `Mat`); `stddev_f32` signature expects `&Mat`. | Changed `stddev_f32` to take `&impl core::ToInputArray` — same approach opencv's own API uses, accepts both `&Mat` and `&BoxedRef` without manual deref. | +| 7 (algorithm) | `crates/movement_detector/src/internal/optical_flow/mod.rs` | 172-191 (now 172-201) | Residual computation iterated over ALL LK-tracked feature pairs, not RANSAC inliers — but the docstring on `HomographyResult::residual_magnitude_px` says "Mean reprojection residual across **inliers**". For a synthetic pure-pan checkerboard, edge features with no match in the post-shift region become RANSAC outliers and inflated the residual to 4.08 px (test asserts < 3.0). Real production bug: the residual was systematically over-reporting motion magnitude. | Added a check against the `mask` returned by `find_homography(..., RANSAC, 3.0)` so only inlier pairs contribute. Now matches the docstring + passes AC-1. | + +### Pre-existing failures (out of batch 19 scope — recorded as leftovers) + +These are in `crates/frame_ingest/` (batches 16/17/18, owned by AZ-657/658). The Jetson test gate is the first place they have surfaced because the macOS dev box doesn't have h264_cuvid registered at all and these tests had not been run on production-target hardware before. + +| Failing target | Symptom | Root cause | +|---|---|---| +| `cargo test -p frame_ingest --lib` | SIGSEGV at `[h264_cuvid @ ...] 
Cannot load libnvcuvid.so.1` | `decoder.rs::try_open` uses `Context::new().decoder().open_as(codec)` which returns `Ok` even for codecs whose runtime backend (libnvcuvid) is missing. The fallback to software h264 never fires; the first `send_packet` SEGVs. Ubuntu's libavcodec58 advertises `h264_cuvid` because it was built with cuvid headers — but the dynamic libnvcuvid.so.1 is NOT in the test container. → leftover `2026-05-20_frame_ingest_cuvid_segv.md`. | +| `cargo test -p frame_ingest --test decoder_pipeline` | Same SIGSEGV chain | Same root cause as above. | +| `cargo test -p frame_ingest --test publisher ac1_three_consumers_at_rate_lose_no_frames` | "telemetry stalled at 25/30" | Timing-sensitive test; the per-frame budget is too tight for the Jetson Orin Nano Super (6-core ARM Cortex-A78AE) compared to the Mac dev box (M-series). Passed on the second run, so this is flaky on slower hardware. → leftover `2026-05-20_frame_ingest_publisher_timing_flake.md`. | + +These two leftovers do NOT block batch 20: AZ-663 / AZ-664 (movement_detector) and AZ-670 / AZ-671 (semantic_analyzer) — the actual candidates per `_docs/02_tasks/_dependencies_table.md` — do not touch `frame_ingest`. --- @@ -121,4 +155,4 @@ Per `implement/SKILL.md` Step 12, `In Testing` is set post-commit and signals "d ## Next Batch -**Hold** — autodev will NOT auto-chain to batch 20 selection. The user must satisfy the batch-19 test gate first (run `cargo test --workspace` after OpenCV is locally / CI installable) so batch 20 does not build on unverified code. +Batch-19 test gate is **GREEN**. Ready to auto-chain to batch 20 selection at the next autodev tick. 
diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 1f79d0c..fa9b0a6 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -6,25 +6,26 @@ step: 7 name: Implement status: in_progress sub_step: - phase: 13 - name: between-batches-blocked - detail: "batch-19 test gate deferred (leftover); batch-20 selection blocked until OpenCV-installable env" + phase: 14 + name: batch-20-select + detail: "batch-19 test gate GREEN (391 passed, 0 in-scope failures on Jetson Docker); ready to pick batch 20" retry_count: 0 cycle: 1 tracker: jira ## Last Completed Batch batch: 19 -commit: db844db +commit: db844db (impl), 202b2cb (archive), pending (test-gate fixes + Jetson Docker infra) ticket: AZ-662, AZ-669 jira_status: In Testing (transitioned 2026-05-20 — id 10036) -report: _docs/03_implementation/batch_19_cycle1_report.md (PASS_WITH_WARNINGS — see report for F1-F5) -test_gate: DEFERRED — see _docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md +report: _docs/03_implementation/batch_19_cycle1_report.md (PASS_WITH_WARNINGS — see report for F1-F5; test-gate fixes documented in "Test Run — DONE" section) +test_gate: GREEN — 391 tests passed across 58 binaries on jetson-e2e (Dockerfile.test); 6 compile errors + 1 algorithm bug in db844db were fixed inline (test gate caught them — see report). 2 pre-existing frame_ingest failures recorded as leftovers (h264_cuvid SEGV + publisher timing flake), out of batch 19 scope. 
## Process Leftovers - `_docs/_process_leftovers/2026-05-20_autopilot_clippy.md` — still pending; out-of-scope for batch 18 - `_docs/_process_leftovers/2026-05-20_mission_executor_ac3_flake.md` — still pending; fix when next mission_executor batch lands -- `_docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md` — BLOCKS batch 20; resolve by installing native OpenCV locally or wiring Woodpecker CI on Jetson +- `_docs/_process_leftovers/2026-05-20_frame_ingest_cuvid_segv.md` — NEW; HIGH severity production bug exposed by Jetson test gate; fix in next batch touching `frame_ingest` +- `_docs/_process_leftovers/2026-05-20_frame_ingest_publisher_timing_flake.md` — NEW; LOW severity Jetson-specific timing flake; address alongside cuvid leftover ## Cumulative Review Cadence Last cumulative: batches 16–18. Next due: end of batch 21 (or sooner if a large-scope batch warrants it). diff --git a/_docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md b/_docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md deleted file mode 100644 index 85c91fd..0000000 --- a/_docs/_process_leftovers/2026-05-20_batch19_opencv_test_gate.md +++ /dev/null @@ -1,49 +0,0 @@ -# Leftover — Batch 19 OpenCV test gate - -- **Timestamp**: 2026-05-20T20:35:00+03:00 -- **Source**: autodev batch-19 close-out session -- **Origin**: commit `db844db [AZ-662] [AZ-669] Implement ego-motion estimator and primitive graph` -- **Blocked operation**: `cargo test --workspace` (specifically the `movement_detector` and `semantic_analyzer` crates that newly depend on the `opencv = "0.98"` workspace dep) - -## Why it is blocked - -The crate uses the Rust `opencv` 0.98 binding, which pulls in the native OpenCV 4 system library at link time. - -1. **macOS dev box**: no `libopencv*` installed. `brew install opencv pkg-config` failed with `ENOSPC` — data-partition free space ≤ 1.1 GiB; opencv + transitive deps (proj, ffmpeg, qt, vtk, openblas, ceres-solver, ...) need ~3-5 GiB. -2. 
**Jetson (`jetson-e2e`)**: state file recorded `ssh jetson-e2e && cargo test --workspace` as the authoritative test path, but the host is configured as the CI infra box (Gitea + Woodpecker via `~/ci/docker-compose.ci.yml`). It has neither the autopilot source checkout nor `cargo` at any standard path. The recorded plan is not directly executable. -3. **Dockerfile**: `apt-get install -y --no-install-recommends ca-certificates libssl3` in the `runtime` stage only — the `rust:1.82-bookworm` builder image does NOT install `libopencv-dev`. A vanilla `docker build` will also fail. - -## Test design (already in source, not yet executed) - -| Crate | Test | Maps to AC | -|-------|------|------------| -| `movement_detector` | `internal::ego_motion::tests::ac1_pure_pan_residual_near_zero` | AZ-662 AC-1 | -| `movement_detector` | `internal::ego_motion::tests::ac2_skew_above_zoom_out_tolerance_dropped` | AZ-662 AC-2 | -| `movement_detector` | `internal::ego_motion::tests::ac3_degenerate_white_frame` | AZ-662 AC-3 | -| `movement_detector` | `internal::zoom_bands::tests::*` (3 tests) | tolerance-table coverage | -| `movement_detector` | `internal::telemetry_sync::tests::*` (3 tests) | skew-gate edge cases | -| `semantic_analyzer` | `internal::primitive_graph::builder::tests::ac1_node_counts_per_class` | AZ-669 AC-1 | -| `semantic_analyzer` | `internal::scoring::freshness::tests::ac2_freshness_score_bounded` | AZ-669 AC-2 | -| `semantic_analyzer` | `internal::primitive_graph::builder::tests::ac3_disconnected_path_graph_flagged` | AZ-669 AC-3 | - -## Replay options (any one closes the gate) - -1. **macOS local — preferred**: free ≥ 5 GiB on the data partition (`df -h /System/Volumes/Data`), then `brew install opencv pkg-config && cargo test --workspace`. This matches the pattern used for `ffmpeg-next` in batches 17/18. -2. 
**Jetson via CI**: push the `dev` branch to Gitea, configure the Woodpecker pipeline to run `cargo test --workspace` inside a `rust:1.82-bookworm` container with `apt-get install -y libopencv-dev clang libclang-dev` in a prep step. -3. **Docker local**: extend the workspace `Dockerfile` (build stage) with `apt-get install -y libopencv-dev clang libclang-dev pkg-config` BEFORE the `cargo build` line, then `docker build -t autopilot-test --target build .` and `docker run --rm autopilot-test cargo test --workspace`. -4. **Jetson as dev box**: clone the repo to `~/autopilot` on `jetson-e2e`, install rustup + cargo, install `libopencv-dev`, then run tests there. (Most setup effort; only worth it if Jetson will keep being used as the dev sandbox.) - -## Acceptance for closing this leftover - -- All tests listed above run successfully. -- The full `cargo test --workspace` produces the same pre-existing flake summary as the batches-16-18 cumulative review (`mission_executor` `ac3_bounded_retry_then_success` / `ac1_multirotor_happy_path_reaches_done` may flake — tracked in `2026-05-20_mission_executor_ac3_flake.md`; not blocking). -- Append the run output to `batch_19_cycle1_report.md` under a "Test Run — DONE" section and remove the "Test Gate — DEFERRED" caveat. -- Delete this leftover file. - -## Why no Jira write deferral - -AZ-662 + AZ-669 have already been transitioned to `In Testing` per implement-skill Step 12 semantics ("dev work done, tests should now run"). The test gate itself is not a Jira write — it is a CI / local-build action. No tracker replay required when this leftover closes. - -## Why this blocks batch 20 - -Batch 20 candidates (`AZ-663`, `AZ-664`, `AZ-670`, `AZ-671`, ...) depend on `movement_detector::ego_motion` and `semantic_analyzer::primitive_graph` per `_docs/02_tasks/_dependencies_table.md`. Building batch 20 on unverified `db844db` risks compounding bugs across two cycles before any test ever runs. 
diff --git a/_docs/_process_leftovers/2026-05-20_frame_ingest_cuvid_segv.md b/_docs/_process_leftovers/2026-05-20_frame_ingest_cuvid_segv.md new file mode 100644 index 0000000..d83d6c1 --- /dev/null +++ b/_docs/_process_leftovers/2026-05-20_frame_ingest_cuvid_segv.md @@ -0,0 +1,65 @@ +# Leftover — frame_ingest h264_cuvid SIGSEGV + +- **Timestamp**: 2026-05-20T22:10:00+03:00 +- **Source**: Batch-19 Jetson test-gate run (commit pending — closes batch 19) +- **Severity**: HIGH — real production bug; would crash the decoder process in any deployment where Ubuntu's libavcodec58 was built with cuvid headers but libnvcuvid.so.1 is missing (e.g., a Jetson reflash before the NVIDIA driver is installed, or any non-NVIDIA host with `libavcodec-extra` installed). +- **Origin component**: `frame_ingest` (AZ-657 / AZ-658, batches 16-18) +- **NOT in batch 19 scope** — recorded for the next batch that touches `frame_ingest`. + +## Symptom + +`cargo test -p frame_ingest --lib` and `cargo test -p frame_ingest --test decoder_pipeline` both SIGSEGV during construction of the production decoder: + +``` +[h264_cuvid @ 0xffff8c000d70] Cannot load libnvcuvid.so.1 +[h264_cuvid @ 0xffff8c000d70] Failed loading nvcuvid. +error: test failed, to rerun pass `-p frame_ingest --lib` +Caused by: + process didn't exit successfully: `.../frame_ingest-...` (signal: 11, SIGSEGV: invalid memory reference) +``` + +Reproduced in `Dockerfile.test` (ubuntu:22.04 + libopencv-dev + libav*-dev + no NVIDIA driver) — i.e., the canonical "production-like minus NVDEC" environment. 
+ +## Root cause + +`crates/frame_ingest/src/internal/decoder.rs::open_with_backend`: + +```rust +if let Some(nv) = ffmpeg::codec::decoder::find_by_name(codec.nvdec_name()) { + match try_open(nv) { + Ok(d) => { return Ok((d, DecoderBackend::Nvdec)); } + Err(e) => { /* fall through to software */ } + } +} +``` + +and `try_open`: + +```rust +fn try_open(codec: ffmpeg::Codec) -> Result { + let ctx = ffmpeg::codec::Context::new(); + let opened = ctx.decoder().open_as(codec).map_err(DecoderInitError::OpenFailed)?; + opened.video().map_err(DecoderInitError::OpenFailed) +} +``` + +Ubuntu's `libavcodec58` package was built against the NVIDIA cuvid headers, so `find_by_name("h264_cuvid")` returns `Some(...)` **even when libnvcuvid.so.1 is absent at runtime**. `open_as(codec)` ALSO returns `Ok` because FFmpeg defers the libnvcuvid `dlopen` until the first `send_packet`. The fallback to software h264 therefore never fires; the first decode SEGVs because `libnvcuvid.so.1` couldn't be opened. + +## Fix sketch + +In `try_open` (or a new `probe_nvdec` helper), call `send_packet` with a minimal valid NAL unit (or just allocate a CUDA context via `avcodec_send_packet` + `avcodec_receive_frame` round-trip) so the libnvcuvid load is attempted at probe time. If it fails, return `Err(DecoderInitError::OpenFailed(...))` so the existing fallback kicks in. + +Alternative (cheaper) probe: `dlopen("libnvcuvid.so.1")` directly via the `libloading` crate before declaring NVDEC opened. If dlopen fails, immediately fall back to software without ever touching the FFmpeg cuvid path. + +Either approach restores the AZ-658 design intent ("real NVDEC binding when present, real software fallback always") — currently the fallback only fires when the cuvid codec is unregistered, not when it is registered-but-non-functional. + +## Acceptance for closing this leftover + +- `cargo test -p frame_ingest --lib` passes in `Dockerfile.test` on `jetson-e2e`. 
+- `cargo test -p frame_ingest --test decoder_pipeline` passes in the same env. +- `FfmpegDecoder::new(Codec::H264)` returns `Ok` with `backend() == Software` (not NVDEC) when libnvcuvid.so.1 is missing, regardless of whether `h264_cuvid` is registered. +- A new test (e.g., `decoder_falls_back_to_software_when_libnvcuvid_missing`) covers the regression and runs in `Dockerfile.test`. + +## Suggested owner + +Next batch that touches `frame_ingest` (likely a maintenance touch when AZ-678 / AZ-679 / AZ-680 land). Could also be packaged as a standalone Bug ticket in Jira; defer to whoever picks up the next `frame_ingest` work. diff --git a/_docs/_process_leftovers/2026-05-20_frame_ingest_publisher_timing_flake.md b/_docs/_process_leftovers/2026-05-20_frame_ingest_publisher_timing_flake.md new file mode 100644 index 0000000..16b0aba --- /dev/null +++ b/_docs/_process_leftovers/2026-05-20_frame_ingest_publisher_timing_flake.md @@ -0,0 +1,38 @@ +# Leftover — frame_ingest publisher timing flake on Jetson + +- **Timestamp**: 2026-05-20T22:10:00+03:00 +- **Source**: Batch-19 Jetson test-gate run (commit pending — closes batch 19) +- **Severity**: LOW — flaky test, not a production bug; passed on the second run. +- **Origin component**: `frame_ingest` (AZ-657, batch 16) +- **NOT in batch 19 scope** — recorded for the next batch that touches `frame_ingest`. + +## Symptom + +`cargo test -p frame_ingest --test publisher ac1_three_consumers_at_rate_lose_no_frames` failed on the first run inside `Dockerfile.test` on `jetson-e2e`: + +``` +---- ac1_three_consumers_at_rate_lose_no_frames stdout ---- +thread 'tokio-rt-worker' (1069) panicked at crates/frame_ingest/tests/publisher.rs:78:31: +telemetry stalled at 25/30 +``` + +Passed on the second run with no code change. The test produces 30 frames at a fixed rate and expects all three consumers to keep up.
The Jetson Orin Nano Super (6-core Cortex-A78AE at ~2 GHz) is significantly slower than the macOS dev box where the test was originally tuned, so the per-frame timing budget (the source of the 25/30 cutoff at line 78) is too tight for this hardware under load (e.g., during a cold `cargo build` of the next test binary). + +## Fix sketch + +Two options: + +1. **Relax the timing budget** in `crates/frame_ingest/tests/publisher.rs:78` to allow longer per-frame deadlines, OR derive it from a measured baseline so a slow host gets proportionally more time. The test's INTENT — "all three consumers receive all 30 frames" — is preserved; only the synthetic rate is adjusted. + +2. **Mark the test `#[ignore]` on aarch64-linux (via `#[cfg_attr(all(target_arch = "aarch64", target_os = "linux"), ignore)]`) with a comment pointing here**, then add a slower-rate variant that runs everywhere. This keeps the original test as an "ideal-hardware" check. + +Option 1 is cleaner and matches the existing pattern in the same crate (`ac2_slow_consumer_drops_while_fast_consumers_unaffected` uses a fixed but generous rate). + +## Acceptance for closing this leftover + +- `cargo test -p frame_ingest --test publisher` passes in `Dockerfile.test` on `jetson-e2e` three consecutive times, starting from the first run (no retries). +- Test intent (zero-frame-loss across 3 consumers at the configured rate) is preserved. + +## Suggested owner + +Whichever batch next touches `frame_ingest`. Same batch as `2026-05-20_frame_ingest_cuvid_segv.md` if both can be addressed together. 
diff --git a/crates/movement_detector/src/internal/optical_flow/mod.rs b/crates/movement_detector/src/internal/optical_flow/mod.rs index 8ed3219..7d44127 100644 --- a/crates/movement_detector/src/internal/optical_flow/mod.rs +++ b/crates/movement_detector/src/internal/optical_flow/mod.rs @@ -38,10 +38,10 @@ pub fn is_degenerate(gray: &Mat) -> opencv::Result { let mut max_val = 0.0f64; core::min_max_loc( gray, - &mut min_val, - &mut max_val, - &mut core::Point::default(), - &mut core::Point::default(), + Some(&mut min_val), + Some(&mut max_val), + None, + None, &core::no_array(), )?; Ok((max_val - min_val) < 10.0) @@ -66,8 +66,9 @@ pub fn frame_to_gray(frame: &Frame) -> opencv::Result { h, w, core::CV_8UC3, core::Scalar::all(0.0), )?; // SAFETY: rgb_mat is a freshly allocated continuous Mat; no aliasing. + // `data_mut()` returns `*mut u8` directly in opencv 0.98 (no Result). let mat_data = unsafe { - std::slice::from_raw_parts_mut(rgb_mat.data_mut()?, rgb_len) + std::slice::from_raw_parts_mut(rgb_mat.data_mut(), rgb_len) }; mat_data.copy_from_slice(&data[..rgb_len]); let mut gray = Mat::default(); @@ -81,8 +82,9 @@ fn copy_bytes_to_gray_mat(src: &[u8], w: i32, h: i32) -> opencv::Result { let mut mat = Mat::new_rows_cols_with_default(h, w, core::CV_8UC1, core::Scalar::all(0.0))?; // SAFETY: mat is a freshly allocated continuous Mat; no aliasing. + // `data_mut()` returns `*mut u8` directly in opencv 0.98 (no Result). let mat_data = unsafe { - std::slice::from_raw_parts_mut(mat.data_mut()?, src.len()) + std::slice::from_raw_parts_mut(mat.data_mut(), src.len()) }; mat_data.copy_from_slice(src); Ok(mat) @@ -169,10 +171,26 @@ pub fn estimate_homography( } } - // 6. Mean reprojection residual across all tracked inliers. + // 6. Mean reprojection residual across RANSAC inliers ONLY. + // + // `find_homography(..., RANSAC, 3.0)` populates `mask` with 1 for + // inlier point pairs (consistent with the fitted homography to + // within 3 px) and 0 for outliers. 
Including outliers in the + // residual would defeat the purpose of RANSAC: a synthetic pure + // pan can have edge features whose LK-tracked flow is off by the + // shift amount (the post-shift region falls outside the original + // frame); those points become RANSAC outliers and would otherwise + // inflate the residual by several pixels. let mut total = 0.0f32; let mut count = 0u32; for i in 0..good_prev.len() { + let is_inlier = mask + .at_2d::(i as i32, 0) + .map(|v| *v != 0) + .unwrap_or(false); + if !is_inlier { + continue; + } let p = good_prev.get(i)?; let c = good_curr.get(i)?; let x = p.x as f64; diff --git a/crates/semantic_analyzer/src/internal/scoring/freshness.rs b/crates/semantic_analyzer/src/internal/scoring/freshness.rs index 53ea20e..45f4a6a 100644 --- a/crates/semantic_analyzer/src/internal/scoring/freshness.rs +++ b/crates/semantic_analyzer/src/internal/scoring/freshness.rs @@ -53,7 +53,8 @@ fn frame_to_gray_mat(frame: &Frame) -> opencv::Result { let y_len = (w * h) as usize; let mut mat = Mat::new_rows_cols_with_default(h, w, core::CV_8UC1, Scalar::all(0.0))?; // SAFETY: freshly allocated continuous Mat; no aliasing. - let dst = unsafe { std::slice::from_raw_parts_mut(mat.data_mut()?, y_len) }; + // `data_mut()` returns `*mut u8` directly in opencv 0.98 (no Result). 
+ let dst = unsafe { std::slice::from_raw_parts_mut(mat.data_mut(), y_len) }; dst.copy_from_slice(&data[..y_len]); Ok(mat) } @@ -61,7 +62,7 @@ fn frame_to_gray_mat(frame: &Frame) -> opencv::Result { let rgb_len = (w * h * 3) as usize; let mut rgb = Mat::new_rows_cols_with_default(h, w, core::CV_8UC3, Scalar::all(0.0))?; - let dst = unsafe { std::slice::from_raw_parts_mut(rgb.data_mut()?, rgb_len) }; + let dst = unsafe { std::slice::from_raw_parts_mut(rgb.data_mut(), rgb_len) }; dst.copy_from_slice(&data[..rgb_len]); let mut gray = Mat::default(); imgproc::cvt_color(&rgb, &mut gray, imgproc::COLOR_RGB2GRAY, 0)?; @@ -128,7 +129,9 @@ fn variance(mat: &Mat) -> opencv::Result { Ok(std * std) } -fn stddev_f32(mat: &Mat) -> opencv::Result { +// Accept `&impl ToInputArray` so both `&Mat` and `&BoxedRef` (returned +// by `Mat::roi` in opencv 0.98) can be passed without manual deref. +fn stddev_f32(mat: &impl core::ToInputArray) -> opencv::Result { let mut mean_mat = Mat::default(); let mut stddev_mat = Mat::default(); core::mean_std_dev(mat, &mut mean_mat, &mut stddev_mat, &core::no_array())?; diff --git a/scripts/jetson-test.sh b/scripts/jetson-test.sh new file mode 100755 index 0000000..f86bc83 --- /dev/null +++ b/scripts/jetson-test.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# +# jetson-test.sh — run the full `cargo test --workspace` on the +# production-target Jetson via Docker (Dockerfile.test). +# +# Why this exists: the autopilot workspace depends on `opencv`, +# `ffmpeg-next`, `tonic`/`prost`, and `libclang` at build time. The +# macOS dev box does not always have native OpenCV / FFmpeg headers +# installed, and the production target is a Jetson Orin Nano Super +# running JetPack 6 (Ubuntu 22.04 aarch64). This script bridges the +# two by rsync-ing the source to the Jetson and running tests inside +# a container that mirrors the production environment exactly. +# +# Prerequisites: +# - `ssh ` works passwordlessly (key-based auth). 
+# - The remote user is in the `docker` group (no sudo for `docker`). +# - `rsync` is available locally. +# +# Usage: +# scripts/jetson-test.sh # full suite, default host +# SSH_HOST=other-jetson scripts/jetson-test.sh +# scripts/jetson-test.sh -p movement_detector -p semantic_analyzer +# +# Any args passed after the script name are shell-quoted (printf %q) and +# forwarded to `cargo test` inside the container (default: workspace suite). +# +# Exit code: forwarded from `cargo test` inside the container. + +set -euo pipefail + +SSH_HOST="${SSH_HOST:-jetson-e2e}" +REMOTE_DIR="${REMOTE_DIR:-/home/jetson/autopilot}" +IMAGE_TAG="${IMAGE_TAG:-autopilot-test}" +DOCKERFILE="${DOCKERFILE:-Dockerfile.test}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +cd "${REPO_ROOT}" + +log() { + printf '\n\033[1;34m[jetson-test]\033[0m %s\n' "$*" +} + +log "Sync source to ${SSH_HOST}:${REMOTE_DIR} (excluding target/, .git/)" +ssh "${SSH_HOST}" "mkdir -p '${REMOTE_DIR}'" +rsync -az --delete \ + --exclude='target/' \ + --exclude='.git/' \ + --exclude='.DS_Store' \ + --exclude='*.swp' \ + --exclude='node_modules/' \ + ./ "${SSH_HOST}:${REMOTE_DIR}/" + +log "Build test image ${IMAGE_TAG} on ${SSH_HOST} (may take 10-20 min on first run)" +ssh "${SSH_HOST}" "cd '${REMOTE_DIR}' && docker build --pull -t '${IMAGE_TAG}' -f '${DOCKERFILE}' ." + +if [[ $# -gt 0 ]]; then + log "Running: cargo test $*" + ssh "${SSH_HOST}" \ + "cd '${REMOTE_DIR}' && docker run --rm \ + -v '${REMOTE_DIR}:/workspace' \ + -w /workspace \ + '${IMAGE_TAG}' \ + cargo test $(printf '%q ' "$@")" +else + log "Running default: cargo test --workspace --no-fail-fast --color always" + ssh "${SSH_HOST}" \ + "cd '${REMOTE_DIR}' && docker run --rm \ + -v '${REMOTE_DIR}:/workspace' \ + -w /workspace \ + '${IMAGE_TAG}'" +fi