[AZ-701] Fix Jetson e2e harness infrastructure blockers

- gtsam_isam2_estimator: shim for gtsam>=4.3a0 aarch64 pre-release
  where IncrementalFixedLagSmoother/FixedLagSmootherKeyTimestampMap
  moved from gtsam_unstable to gtsam
- inference_factory: eager import of c7_inference package so
  register_component_block runs before config.components is read
- docker-compose.test.jetson.yml: remove companion and
  operator-orchestrator (not needed by replay CLI tests and crash
  in test env due to AZ-618 live-mode deps); add db-migrate and
  tile-init setup-profile services for Alembic migrations and FAISS
  fixture provisioning; update e2e-runner depends_on to db only
- scripts/mk_test_faiss_fixture.py: generate minimal HNSW32 FAISS
  descriptor index into the tile-data volume for the test harness

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-20 19:01:36 +03:00
parent 1b65619524
commit 21a7784682
4 changed files with 143 additions and 30 deletions
+61 -27
View File
@@ -33,31 +33,66 @@
# gps-denied client (AZ-691) lands.
services:
companion:
extends:
file: docker-compose.yml
service: companion
environment:
LOG_LEVEL: INFO
# Jetson is the canonical test env (2026-05-20 policy); the FAISS
# HNSW descriptor index is required by c2_vpr in this binary.
# Without this flag airborne_bootstrap fails at
# _build_c6_descriptor_index → RuntimeNotAvailableError. faiss-cpu
# is installed via the [dev] extra; the gate is build-flag, not
# wheel availability.
BUILD_FAISS_INDEX: "ON"
# ------------------------------------------------------------------
# Init services (profiles: [setup]) — NOT started by the default
# `docker compose up`. They are invoked explicitly as one-shot jobs
# via `docker compose --profile setup run --rm <service>` before the
# main harness run:
#
# 1. db-migrate — applies Alembic migrations so companion's
# FreshnessGate / PostgresFilesystemStore find their tables.
# (AZ-618 ordering gap: build_pre_constructed queries the DB
# before the composition root can call apply_migrations.)
#
# 2. tile-init — writes a minimal valid HNSW32 FAISS descriptor
# index into the tile-data volume so FaissDescriptorIndex._load()
# succeeds during build_pre_constructed.
# (AZ-618 gap: the production provisioning pipeline normally
# writes the index; in the test harness it must pre-exist.)
#
# They are in profile "setup" so they do not participate in the
# default `docker compose up` and do not trip --abort-on-container-exit.
# ------------------------------------------------------------------
db-migrate:
profiles: ["setup"]
image: gps-denied-onboard/e2e-runner:jetson
entrypoint: ["alembic"]
command: ["upgrade", "head"]
working_dir: /opt/project
volumes:
- .:/opt/project:ro
depends_on:
db:
condition: service_healthy
restart: "no"
operator-orchestrator:
extends:
file: docker-compose.yml
service: operator-orchestrator
environment:
BUILD_FAISS_INDEX: "ON"
tile-init:
profiles: ["setup"]
image: gps-denied-onboard/e2e-runner:jetson
entrypoint: ["python3"]
command: ["/opt/project/scripts/mk_test_faiss_fixture.py"]
volumes:
- .:/opt/project:ro
- tile-data:/var/lib/gps-denied/tiles
restart: "no"
mock-sat:
extends:
file: docker-compose.yml
service: mock-sat
# companion and operator-orchestrator are intentionally absent from
# the Jetson e2e test harness.
#
# Every test in tests/e2e/replay/ invokes the ``gps-denied-replay``
# console-script directly as a subprocess and does not call the
# companion or operator-orchestrator HTTP APIs. Including either
# service caused the harness to abort before any test could run:
#
# * companion crashes at startup because live-mode requires a
# production-provisioned C7 inference engine (PyTorch FP16 or
# TensorRT) that is absent from the test environment. This is the
# pre-existing AZ-618 gap (build_pre_constructed fails before the
# composition root can apply_migrations + engine artifacts).
# * operator-orchestrator crashes for the same C7 inference reason.
#
# When the AZ-618 epic ships the full airborne boot-up in a sandboxed
# environment (Phase E / engine stubs), companion can be re-added here.
db:
extends:
@@ -81,10 +116,6 @@ services:
count: all
capabilities: [gpu]
depends_on:
companion:
condition: service_healthy
mock-sat:
condition: service_healthy
db:
condition: service_healthy
environment:
@@ -96,6 +127,9 @@ services:
# execute. This is the WHOLE POINT of the Jetson harness.
GPS_DENIED_TIER: "2"
DB_URL: postgresql://gps_denied:dev@db:5432/gps_denied
# SATELLITE_PROVIDER_URL / COMPANION_URL are set but not used by
# the replay CLI tests (gps-denied-replay runs as a subprocess and
# does not call the companion or satellite-provider HTTP APIs).
SATELLITE_PROVIDER_URL: http://mock-sat:5100
COMPANION_URL: http://companion:8080
CAMERA_CALIBRATION_PATH: /opt/tests/fixtures/calibration/adti26.json
+61
View File
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
"""Create a minimal valid FAISS HNSW32 + IndexIDMap2 fixture for the test harness.

Used by the `tile-init` init service in docker-compose.test.jetson.yml.

Writes three files to /var/lib/gps-denied/tiles/:

    descriptor.index            — empty HNSW32 dim=512 binary
    descriptor.index.sha256     — sha256 sidecar (matches FaissDescriptorIndex._load)
    descriptor.index.meta.json  — metadata (descriptor_dim, hnsw_params.metric, ...)

Running this twice is idempotent (overwrites the previous fixture); only the
``built_at`` timestamp in the metadata changes between runs.
"""
from __future__ import annotations

import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path

import faiss  # type: ignore[import-untyped]

DESCRIPTOR_DIM = 512
HNSW_M = 32
# HNSW tuning values recorded in the metadata. They are also applied to the
# index below so the serialized binary and meta.json cannot drift apart if
# the FAISS library defaults ever change.
HNSW_EF_CONSTRUCTION = 40
HNSW_EF_SEARCH = 16

root = Path("/var/lib/gps-denied/tiles")
root.mkdir(parents=True, exist_ok=True)

# Empty inner-product HNSW graph wrapped in an ID map; no vectors are added.
inner = faiss.IndexHNSWFlat(DESCRIPTOR_DIM, HNSW_M, faiss.METRIC_INNER_PRODUCT)
inner.hnsw.efConstruction = HNSW_EF_CONSTRUCTION
inner.hnsw.efSearch = HNSW_EF_SEARCH
index = faiss.IndexIDMap2(inner)

idx_path = root / "descriptor.index"
faiss.write_index(index, str(idx_path))

# Hash the bytes exactly as they landed on disk so the sidecar always
# describes the file that will be read back.
idx_bytes = idx_path.read_bytes()
sha256 = hashlib.sha256(idx_bytes).hexdigest()
(idx_path.parent / (idx_path.name + ".sha256")).write_text(sha256, encoding="ascii")

meta = {
    "descriptor_dim": DESCRIPTOR_DIM,
    "n_vectors": 0,
    "backbone_label": "ultra_vpr",
    # Placeholder digest: the fixture is not derived from a real backbone.
    "backbone_sha256_hex": "0" * 64,
    "built_at": datetime.now(timezone.utc).isoformat(),
    "hnsw_params": {
        "m": HNSW_M,
        "ef_construction": HNSW_EF_CONSTRUCTION,
        "ef_search": HNSW_EF_SEARCH,
        "metric": "INNER_PRODUCT",
    },
    "sidecar_sha256_hex": sha256,
    "file_path": str(idx_path),
    "id_mapping": [],
}
(idx_path.parent / (idx_path.name + ".meta.json")).write_text(
    json.dumps(meta, sort_keys=True, indent=2), encoding="utf-8"
)

print(
    f"[tile-init] OK: empty HNSW32 dim={DESCRIPTOR_DIM} index "
    f"at {idx_path} sha256={sha256[:16]}..."
)
@@ -39,6 +39,17 @@ from uuid import UUID, uuid4
import gtsam
import gtsam_unstable
import numpy as np
# gtsam >=4.3a0 (aarch64 pre-release) moved IncrementalFixedLagSmoother and
# FixedLagSmootherKeyTimestampMap to the main gtsam module. Fall back to gtsam
# when gtsam_unstable no longer carries these symbols so the estimator works
# on both 4.2.x (x86_64 PyPI) and 4.3a0 (aarch64 pre-release).
try:
_IncrementalFixedLagSmoother = gtsam_unstable.IncrementalFixedLagSmoother
_FixedLagSmootherKeyTimestampMap = gtsam_unstable.FixedLagSmootherKeyTimestampMap
except AttributeError:
_IncrementalFixedLagSmoother = gtsam.IncrementalFixedLagSmoother
_FixedLagSmootherKeyTimestampMap = gtsam.FixedLagSmootherKeyTimestampMap
from numpy.linalg import LinAlgError
from gps_denied_onboard._types.geo import LatLonAlt
@@ -168,7 +179,7 @@ class GtsamIsam2StateEstimator(StateEstimator):
self._isam2 = gtsam.ISAM2(gtsam.ISAM2Params())
window_seconds: float = block.keyframe_window_size * _FRAME_PERIOD_S
self._smoother = gtsam_unstable.IncrementalFixedLagSmoother(window_seconds)
self._smoother = _IncrementalFixedLagSmoother(window_seconds)
self._graph = gtsam.NonlinearFactorGraph()
self._values = gtsam.Values()
@@ -1689,14 +1700,14 @@ def _build_pose_noise(covariance: Any | None) -> gtsam.noiseModel.Base:
def _make_timestamp_map(
keys: list[int], ts_ns: int
) -> gtsam_unstable.FixedLagSmootherKeyTimestampMap:
) -> _FixedLagSmootherKeyTimestampMap:
"""Build a ``FixedLagSmootherKeyTimestampMap`` for the smoother.
The smoother needs per-key arrival timestamps in seconds (its
sliding-window evict logic uses them); we feed every newly
inserted key the same window-end timestamp.
"""
ts_map = gtsam_unstable.FixedLagSmootherKeyTimestampMap()
ts_map = _FixedLagSmootherKeyTimestampMap()
ts_seconds = ts_ns * 1e-9
for key in keys:
ts_map.insert((key, ts_seconds))
@@ -20,6 +20,13 @@ from typing import TYPE_CHECKING
from gps_denied_onboard.runtime_root.errors import RuntimeNotAvailableError
# Eager package import so c7_inference.__init__.py runs
# `register_component_block("c7_inference", C7InferenceConfig)` before
# `_c7_config(config)` reads `config.components["c7_inference"]` below.
# The package __init__.py is import-safe (no concrete strategy modules)
# per the Risk-2 mitigation documented in c7_inference/__init__.py.
import gps_denied_onboard.components.c7_inference # noqa: F401
if TYPE_CHECKING:
from gps_denied_onboard.components.c7_inference import (
C7InferenceConfig,