diff --git a/_docs/03_implementation/jetson_harness_setup.md b/_docs/03_implementation/jetson_harness_setup.md new file mode 100644 index 0000000..50e19e6 --- /dev/null +++ b/_docs/03_implementation/jetson_harness_setup.md @@ -0,0 +1,175 @@ +# Jetson e2e Harness — Operator Setup + +AZ-615 / AZ-602 cycle-2. Documents the one-time operator-side setup +that makes `scripts/run-tests-jetson.sh` work against a Jetson Orin Nano +reachable from the developer Mac over SSH. + +## Why a separate Jetson harness exists + +The Colima/Tier-1 smoke harness (`docker-compose.test.yml` + +`tests/e2e/Dockerfile`) verifies wiring, env config, fixture loading, +auto-sync, and JSONL schema — everything UP TO the GPU boundary. But +all three C7 inference strategies +(`pytorch_fp16_runtime.py`, `tensorrt_runtime.py`, +`onnx_trt_ep_runtime.py`) are CUDA-only by design (`model.half().cuda()` +on `pytorch_fp16_runtime.py:189`, no CPU fallback). The full Reality +Gate — including C3 matcher + C7 inference — therefore needs a +CUDA-capable host. + +The Jetson harness runs the same test tree (`tests/e2e/`) on the Jetson +with `GPS_DENIED_TIER=2`, which turns OFF the auto-skip for +`@pytest.mark.tier2` tests (see `tests/conftest.py:31-44`). + +## Hardware contract + +Operator-confirmed environment (2026-05-17): + +* Jetson Orin Nano dev kit +* JetPack 6.2.2+b24 +* L4T R36.5.0 (Jan 2026) +* nvidia-container-toolkit 1.16.2 +* ≥ 30 GB free on `/var/lib/docker` (l4t-pytorch base image ~7 GB + + build cache + fixture volumes) +* Swap enabled (Orin Nano has 8 GB RAM; PyTorch + TensorRT loads spike) + +## One-time setup + +### 1. 
SSH key + alias (on the Mac) + +```bash +# Generate a dedicated keypair (separate from your daily-dev key) +ssh-keygen -t ed25519 -a 100 -f ~/.ssh/id_ed25519_jetson_e2e \ + -C "jetson-e2e $(date +%Y-%m-%d)" + +# Push the public half to the Jetson (asks for the Jetson password once) +ssh-copy-id -i ~/.ssh/id_ed25519_jetson_e2e.pub <jetson-user>@<jetson-host> + +# Verify the Jetson's host key (run this ON the Jetson, via HDMI/serial, +# not over the LAN you're about to trust): +# ssh-keygen -lf /etc/ssh/ssh_host_ed25519_key.pub +# Then compare against what the Mac sees on first connect. Accept only +# if they match. + +# Wire up ~/.ssh/config (gitignored, never committed) +cat >> ~/.ssh/config <<'EOF' +Host jetson-e2e + HostName <jetson-host> + User <jetson-user> + IdentityFile ~/.ssh/id_ed25519_jetson_e2e + IdentitiesOnly yes + AddKeysToAgent yes + UseKeychain yes + StrictHostKeyChecking yes + ServerAliveInterval 30 + ServerAliveCountMax 4 +EOF + +# Cache the passphrase into macOS Keychain (one-time) +ssh-add --apple-use-keychain ~/.ssh/id_ed25519_jetson_e2e +``` + +### 2. Restrict the key's scope on the Jetson (recommended) + +Edit `~/.ssh/authorized_keys` on the Jetson and prefix the line that the +`ssh-copy-id` step appended: + +``` +from="<mac-ip>",no-port-forwarding,no-X11-forwarding,no-agent-forwarding ssh-ed25519 AAAA… jetson-e2e +``` + +Optionally lock to "only run the e2e driver" by adding +`command="docker compose -f /home/jetson/gps-denied-onboard/docker-compose.test.jetson.yml up --abort-on-container-exit"` — +the key can't get a general shell, only invoke that one command. +Caveat: a forced `command=` makes sshd ignore whatever command the client +sends, so the `rsync` and `docker compose ... build` steps of +`scripts/run-tests-jetson.sh` cannot run through this key; apply it only to a +key that is used solely for the `up` step. + +### 3. Harden sshd (optional, recommended for an exposed test rig) + +On the Jetson, create `/etc/ssh/sshd_config.d/10-e2e.conf`: + +``` +PasswordAuthentication no +PermitRootLogin no +PubkeyAuthentication yes +``` + +Then `sudo systemctl reload ssh`. + +### 4. 
Verify the Jetson Docker + GPU pipeline + +```bash +ssh jetson-e2e 'docker run --rm --runtime=nvidia --gpus all \ + nvcr.io/nvidia/l4t-base:r36.4.0 nvidia-smi' +``` + +Expected output: a `nvidia-smi`-style table listing the Orin GPU. If +this fails with "runtime not found" or "no GPU devices", install +`nvidia-container-toolkit` and `sudo systemctl restart docker`. + +### 5. Confirm disk + swap + +```bash +ssh jetson-e2e 'df -h /var/lib/docker && swapon --show && free -h' +``` + +Need ≥ 30 GB free on `/var/lib/docker`. Swap should be at least 4 GB +(JetPack default is 4 GB zram). + +## Running the harness + +From the developer Mac, repo root: + +```bash +bash scripts/run-tests-jetson.sh +``` + +What happens: + +1. `rsync` source → `jetson-e2e:~/gps-denied-onboard/` (excludes `.git`, + `__pycache__`, build artefacts; LFS pointers transfer as text). +2. `ssh jetson-e2e docker compose -f docker-compose.test.jetson.yml build e2e-runner` +3. `ssh jetson-e2e docker compose ... up --abort-on-container-exit --exit-code-from e2e-runner` +4. stdout / stderr stream to the Mac terminal; exit code propagates. + +Override the alias or remote dir if your setup differs: + +```bash +JETSON_SSH_ALIAS=other-host JETSON_REMOTE_DIR=~/somewhere/else \ + bash scripts/run-tests-jetson.sh +``` + +## Smoke vs. Reality Gate split — at a glance + +| Test category | Marker | Colima (Tier-1) | Jetson (Tier-2) | +|---------------|--------|-----------------|-----------------| +| AC-4a AST scan | (none) | runs | runs | +| AC-4b byte-equality | (none) | runs | runs | +| AC-7 skip-gate self-check | (none) | runs | runs | +| AC-9 helper unit tests | (none) | runs | runs | +| AC-1 / AC-2 / AC-3 / AC-5 / AC-6 (heavy) | `tier2` | **SKIPPED** | runs | +| AC-8 operator workflow | `skip` (AZ-616 blocks) | skipped | skipped | + +`GPS_DENIED_TIER` env var controls the auto-skip: + +* `GPS_DENIED_TIER=1` (Colima default) → `tier2` / `gpu` / `docker` + marked tests auto-skipped via `tests/conftest.py:31-44`. 
+* `GPS_DENIED_TIER=2` (Jetson default) → all markers active; everything + runs (subject to other skip gates like `RUN_REPLAY_E2E`). + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|--------------|-----| +| `cannot reach 'ssh jetson-e2e' non-interactively` | Agent isn't unlocked or key not in `authorized_keys` | `ssh-add -l` on Mac; check `~/.ssh/authorized_keys` on Jetson | +| `docker: Error response from daemon: could not select device driver "nvidia"` | nvidia-container-toolkit missing or daemon not restarted after install | `sudo apt install nvidia-container-toolkit && sudo systemctl restart docker` | +| `torch.cuda.is_available() == False` inside the container | `runtime: nvidia` block missing, or building on x86 host | Verify `docker-compose.test.jetson.yml` has `runtime: nvidia`; rebuild on the Jetson | +| `replay.auto_sync.ac8_validation_failed` | AZ-614 (tlog time-base mismatch) — not a harness bug | Fix AZ-614 in `tests/e2e/replay/_tlog_synth.py` | +| `pull access denied for nvcr.io/nvidia/l4t-pytorch` | NGC requires login for some tags | `docker login nvcr.io` (use NGC API key from developer.nvidia.com) | + +## Related Jira + +* AZ-615 — this harness (Jetson runner story) +* AZ-616 — replace `mock-sat` with real `../satellite-provider` service +* AZ-617 — mark heavy ACs with `tier2` (already applied; this story + documents and verifies the auto-skip) +* AZ-614 — tlog time-base mismatch (currently blocks the heavy ACs + from reaching the GPU stage) +* AZ-602 — parent Epic: E2E Tier-1 harness rehabilitation diff --git a/docker-compose.test.jetson.yml b/docker-compose.test.jetson.yml new file mode 100644 index 0000000..680ac0c --- /dev/null +++ b/docker-compose.test.jetson.yml @@ -0,0 +1,98 @@ +# Tier-2 e2e harness — Jetson Orin Nano (JetPack 6.x, L4T R36.x). +# +# AZ-615: companion compose file to `docker-compose.test.yml` that runs +# the full Reality Gate on a CUDA-capable host. 
Used via `ssh jetson-e2e +# "docker compose -f docker-compose.test.jetson.yml up ..."` driven by +# `scripts/run-tests-jetson.sh`. +# +# Difference vs. docker-compose.test.yml: +# * `runtime: nvidia` + a `deploy.resources.reservations.devices` GPU +# reservation (`count: all`) on `e2e-runner` so the SUT can +# resolve `model.half().cuda()` against the Orin GPU. +# * `GPS_DENIED_TIER=2` — turns OFF the auto-skip for `@pytest.mark.tier2` +# ACs (see tests/conftest.py:31-44). The heavy ACs (AC-1, AC-2, AC-3, +# AC-5, AC-6) actually run. +# * Builds from `tests/e2e/Dockerfile.jetson` (l4t-pytorch base). +# * Companion / operator-orchestrator / db / mock-sat continue to come from the root +# `docker-compose.yml` via `extends:` (same as Colima) — they have ARM64 +# tags via the existing build pipeline. +# +# Satellite-provider integration (real .NET service at ../satellite-provider/) +# is tracked separately under AZ-616 and lands as a follow-up patch to this +# file once the auth + tile-source strategy is decided. + +services: + companion: + extends: + file: docker-compose.yml + service: companion + environment: + LOG_LEVEL: INFO + + operator-orchestrator: + extends: + file: docker-compose.yml + service: operator-orchestrator + + mock-sat: + extends: + file: docker-compose.yml + service: mock-sat + + db: + extends: + file: docker-compose.yml + service: db + + e2e-runner: + build: + context: . + dockerfile: tests/e2e/Dockerfile.jetson + image: gps-denied-onboard/e2e-runner:jetson + # nvidia-container-runtime exposes the Tegra GPU + libcuda mounts. + # Without this block the container starts but `torch.cuda.is_available()` + # returns False and every tier2 AC errors at `.cuda()`. 
+ runtime: nvidia + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + depends_on: + companion: + condition: service_healthy + mock-sat: + condition: service_healthy + db: + condition: service_healthy + environment: + # Same FullSystemConfig env block as Colima — see comments in + # docker-compose.test.yml for the per-var rationale. + GPS_DENIED_FC_PROFILE: ardupilot_plane + # Tier-2 turns OFF the `tier2` / `gpu` auto-skip in tests/conftest.py + # so the heavy ACs in tests/e2e/replay/test_derkachi_1min.py actually + # execute. This is the WHOLE POINT of the Jetson harness. + GPS_DENIED_TIER: "2" + DB_URL: postgresql://gps_denied:dev@db:5432/gps_denied + SATELLITE_PROVIDER_URL: http://mock-sat:5100 + COMPANION_URL: http://companion:8080 + CAMERA_CALIBRATION_PATH: /opt/tests/fixtures/calibration/adti26.json + LOG_LEVEL: INFO + LOG_SINK: console + INFERENCE_BACKEND: pytorch_fp16 + FDR_PATH: /var/lib/gps-denied/fdr + TILE_CACHE_PATH: /var/lib/gps-denied/tiles + MAVLINK_SIGNING_KEY: /opt/tests/fixtures/mavlink_signing/dev_key + RUN_REPLAY_E2E: "1" + BUILD_REPLAY_SINK_JSONL: "ON" + volumes: + - ./tests:/opt/tests:ro + - ./_docs/00_problem/input_data:/opt/_docs/00_problem/input_data:ro + - fdr-data:/var/lib/gps-denied/fdr + - tile-data:/var/lib/gps-denied/tiles + +volumes: + db-data: {} + fdr-data: {} + tile-data: {} diff --git a/scripts/run-tests-jetson.sh b/scripts/run-tests-jetson.sh new file mode 100755 index 0000000..5f75b89 --- /dev/null +++ b/scripts/run-tests-jetson.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +# AZ-615: drive the Tier-2 Reality Gate e2e harness on a remote Jetson. +# +# Runs from the developer Mac. Assumes: +# * `ssh jetson-e2e` works via key auth + ~/.ssh/config (see +# _docs/03_implementation/jetson_harness_setup.md for one-time setup). +# * The Jetson has docker + nvidia-container-toolkit + ≥ 30 GB free on +# /var/lib/docker. +# +# Flow: +# 1. 
rsync the working tree to the Jetson under ~/gps-denied-onboard/ +# (excluding .git, LFS pointers, build artefacts). +# 2. ssh into the Jetson and `docker compose build` the e2e-runner image +# against tests/e2e/Dockerfile.jetson. +# 3. ssh again and `docker compose up --abort-on-container-exit +# --exit-code-from e2e-runner` so the local exit code reflects the +# remote test verdict. +# 4. stdout / stderr stream back to the Mac terminal. +# +# Exit code propagates the docker-compose exit code (which == the +# e2e-runner container's exit code, which == pytest's verdict). + +set -euo pipefail + +# ---------------------------------------------------------------------- +# Configuration + +SSH_ALIAS="${JETSON_SSH_ALIAS:-jetson-e2e}" +REMOTE_DIR="${JETSON_REMOTE_DIR:-~/gps-denied-onboard}" +COMPOSE_FILE="docker-compose.test.jetson.yml" + +# Repo root regardless of where the script is invoked from. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# ---------------------------------------------------------------------- +# Pre-flight + +if ! command -v rsync >/dev/null 2>&1; then + echo "ERROR: rsync not on PATH — install with 'brew install rsync' or apt" >&2 + exit 64 +fi + +if ! 
ssh -o BatchMode=yes -o ConnectTimeout=5 "${SSH_ALIAS}" true 2>/dev/null; then + cat >&2 < None: # Act @@ -97,6 +98,7 @@ _ESTIMATOR_OUTPUT_KEYS = frozenset( ) +@pytest.mark.tier2 @_HEAVY_SKIP def test_ac2_jsonl_schema_match(replay_runner) -> None: # Act @@ -121,6 +123,7 @@ def test_ac2_jsonl_schema_match(replay_runner) -> None: # AC-3: ≥ 80 % of emissions within 100 m of ground truth +@pytest.mark.tier2 @_HEAVY_SKIP @pytest.mark.xfail( reason=( @@ -350,6 +353,7 @@ def test_ac4_encoder_byte_equality_via_transport_seam() -> None: # AC-5: Determinism (two runs differ by ≤ 1e-6 in position fields) +@pytest.mark.tier2 @_HEAVY_SKIP def test_ac5_determinism_two_runs_diff(replay_runner) -> None: # Act @@ -378,6 +382,7 @@ def test_ac5_determinism_two_runs_diff(replay_runner) -> None: # AC-6: Pace timing +@pytest.mark.tier2 @_HEAVY_SKIP def test_ac6_pace_realtime_60s_within_5pct(replay_runner) -> None: # Act @@ -391,6 +396,7 @@ def test_ac6_pace_realtime_60s_within_5pct(replay_runner) -> None: ) +@pytest.mark.tier2 @_HEAVY_SKIP def test_ac6_pace_asap_under_30s(replay_runner) -> None: # Act