[AZ-701] HTTP replay API service (FastAPI + magic-byte upload validation)
ci/woodpecker/push/02-build-push Pipeline failed

New replay_api component: FastAPI service wrapping the offline
gps-denied-replay pipeline. POST tlog+video (multipart) → either
sync 200 with result/map/report URLs, or async 202 + job id with
/jobs/{id} polling. Magic-byte validation, bearer auth, in-memory
JobRegistry with concurrency + queue caps (429 on overflow).

Helper accuracy_report.py promoted from tests/ to src/ because the
API needs the Markdown report writer at runtime; all AZ-699 imports
re-pointed. OpenAPI spec exported to docs.

18/18 unit tests pass (AC-1 sync, AC-2 async, AC-3 state machine,
AC-5 auth, AC-6 health, AC-8 concurrency, AC-9 magic-byte). Full
unit suite: 2251 pass, 86 skip, 1 pre-existing C12 cold-start flake
(unchanged). mypy --strict clean on the new surface.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Oleksandr Bezdieniezhnykh
2026-05-20 17:30:26 +03:00
parent b66b68ff76
commit 7d53cef0cf
22 changed files with 2854 additions and 13 deletions
@@ -0,0 +1,82 @@
"""AZ-701 ``replay-api`` console-script.
Builds the FastAPI app from environment configuration and starts
the uvicorn server. Mirrors the operator-side CLI style of
``gps-denied-replay`` and ``gps-denied-render-map``.
"""
from __future__ import annotations
import argparse
import logging
import os
from pathlib import Path
from gps_denied_onboard.replay_api.app import build_runner_from_env, create_app
from gps_denied_onboard.replay_api.storage import StorageRoot
__all__ = ["main"]
_LOGGER = logging.getLogger("gps_denied_onboard.cli.replay_api")
def _build_argparser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
prog="replay-api",
description=(
"Start the gps-denied-onboard replay HTTP API. "
"Upload (tlog + video [+ calibration]); receive GPS "
"fixes + an accuracy report + an HTML map."
),
)
parser.add_argument(
"--host", default=os.environ.get("REPLAY_API_HOST", "0.0.0.0")
)
parser.add_argument(
"--port",
type=int,
default=int(os.environ.get("REPLAY_API_PORT", "8080")),
)
parser.add_argument(
"--storage-root",
type=Path,
default=Path(
os.environ.get(
"REPLAY_API_STORAGE_ROOT", "/var/azaion/replay_api"
)
),
)
parser.add_argument(
"--reload",
action="store_true",
help="Reload on code changes (dev only).",
)
return parser
def main(argv: list[str] | None = None) -> int:
args = _build_argparser().parse_args(argv)
logging.basicConfig(
level=os.environ.get("REPLAY_API_LOG_LEVEL", "INFO"),
format="%(asctime)s %(name)s %(levelname)s %(message)s",
)
try:
import uvicorn
except ImportError:
raise SystemExit(
"uvicorn is not installed. Install with "
"`pip install gps-denied-onboard[operator-tools]`."
)
storage = StorageRoot(args.storage_root)
runner = build_runner_from_env()
app = create_app(runner=runner, storage=storage)
uvicorn.run(
app,
host=args.host,
port=args.port,
reload=args.reload,
log_level=os.environ.get("REPLAY_API_LOG_LEVEL", "info").lower(),
)
return 0
@@ -16,6 +16,14 @@ from gps_denied_onboard.helpers.engine_filename_schema import (
EngineFilenameSchema,
EngineFilenameSchemaError,
)
from gps_denied_onboard.helpers.accuracy_report import (
AC3_GATE_PCT,
AC3_GATE_THRESHOLD_M,
ReportContext,
format_failure_message,
render_report,
verdict_passes_ac3,
)
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
HorizontalErrorDistribution,
@@ -67,10 +75,13 @@ from gps_denied_onboard.helpers.wgs_converter import (
)
__all__ = [
"AC3_GATE_PCT",
"AC3_GATE_THRESHOLD_M",
"ALLOWED_DTYPES",
"ALLOWED_PRECISIONS",
"ENGINE_SUFFIX",
"MAX_ZOOM",
"ReportContext",
"SE3",
"SIDECAR_SUFFIX",
"WEB_MERCATOR_MAX_LAT_DEG",
@@ -96,6 +107,7 @@ __all__ = [
"WgsConverter",
"adjoint",
"exp_map",
"format_failure_message",
"horizontal_error_distribution",
"is_valid_rotation",
"iso_ts_from_clock",
@@ -106,5 +118,7 @@ __all__ = [
"make_imu_preintegrator",
"matrix_to_se3",
"percentile_sorted",
"render_report",
"se3_to_matrix",
"verdict_passes_ac3",
]
@@ -0,0 +1,190 @@
"""Markdown accuracy-report writer (AZ-699 + AZ-701).
Renders a :class:`HorizontalErrorDistribution` (the production
helper in ``gps_denied_onboard.helpers.gps_compare``) plus run
context (calibration acquisition method, clip duration, fixture
paths) into the canonical Markdown layout consumed by
``_docs/06_metrics/real_flight_validation_{date}.md``.
Originally implemented as a test helper under
``tests/e2e/replay/_report_writer.py`` (AZ-699 batch 100). Promoted
to production code in AZ-701 (batch 102) because the ``replay_api``
HTTP service needs to render the same report for every replay job
the operator submits, and a test-only helper cannot be imported
from production code per the module-layout rule.
Style: every function is pure; the side effect (writing the file)
is the caller's. Tests in ``tests/unit/test_az699_report_writer.py``
exercise both the rendering and the threshold-gate verdict logic.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from gps_denied_onboard.helpers.gps_compare import HorizontalErrorDistribution
__all__ = [
"AC3_GATE_PCT",
"AC3_GATE_THRESHOLD_M",
"ReportContext",
"format_failure_message",
"render_report",
"verdict_passes_ac3",
]
# AZ-696 epic AC-3 threshold + minimum-share gate. Keeping these
# named constants here (rather than inlined into the test) so the
# unit tests for the failure-message template can pin them.
AC3_GATE_THRESHOLD_M: float = 100.0
AC3_GATE_PCT: float = 80.0
@dataclass(frozen=True)
class ReportContext:
"""Run context surfaced in the report header.
Attributes:
run_date_utc: ISO-8601 date (YYYY-MM-DD) at which the run
executed — drives the report filename.
tlog_path: Real tlog the runner consumed.
video_path: Video clip the runner consumed.
calibration_acquisition_method: Provenance of the camera
calibration (e.g. ``"factory-sheet"`` for AZ-702 or
``"placeholder"`` for the adti26 fallback). Surfaced in
the failure message per AZ-699 AC-3.
clip_duration_s: Duration of the analysed clip in seconds.
emissions_count: Total estimator-output records consumed
from the JSONL (may differ from
``distribution.count`` when some emissions land
outside the GT window).
"""
run_date_utc: str
tlog_path: Path
video_path: Path
calibration_acquisition_method: str
clip_duration_s: float
emissions_count: int
def verdict_passes_ac3(distribution: HorizontalErrorDistribution) -> bool:
"""Return ``True`` when the run meets AZ-696 epic AC-3."""
if distribution.count == 0:
return False
share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M)
if share is None:
return False
return share * 100.0 >= AC3_GATE_PCT
def format_failure_message(
distribution: HorizontalErrorDistribution,
context: ReportContext,
) -> str:
"""Build the honest failure message for AZ-699 AC-3.
The message references the calibration acquisition method
(factory-sheet for AZ-702 or placeholder otherwise) and the
measured residual budget, so the operator can attribute a
failure to its likely root cause (calibration uncertainty,
drift, anchor scarcity) without re-reading the source.
"""
share = distribution.threshold_hit_share.get(AC3_GATE_THRESHOLD_M, 0.0)
pct = share * 100.0
return (
f"AZ-699 AC-3: only {pct:.1f} % of {distribution.count} "
f"emissions within {AC3_GATE_THRESHOLD_M:.0f} m of ground "
f"truth; epic threshold is {AC3_GATE_PCT:.0f} %. "
f"Residual: mean={distribution.horizontal_error_mean_m:.1f} m, "
f"p50={distribution.horizontal_error_p50_m:.1f} m, "
f"p95={distribution.horizontal_error_p95_m:.1f} m, "
f"p99={distribution.horizontal_error_p99_m:.1f} m. "
f"Calibration: {context.calibration_acquisition_method}. "
"See _docs/06_metrics/real_flight_validation_"
f"{context.run_date_utc}.md for the full distribution."
)
def render_report(
distribution: HorizontalErrorDistribution,
context: ReportContext,
*,
passed: bool,
) -> str:
"""Render the full Markdown report body.
The output layout (header + horizontal-error stats + threshold
table + vertical-error stats + verdict) is the schema referenced
by ``_docs/02_document/tests/blackbox-tests.md``.
"""
verdict = "PASS" if passed else "FAIL"
horiz_rows = [
("Mean", distribution.horizontal_error_mean_m),
("p50", distribution.horizontal_error_p50_m),
("p95", distribution.horizontal_error_p95_m),
("p99", distribution.horizontal_error_p99_m),
]
threshold_rows = [
(t, share)
for t, share in sorted(distribution.threshold_hit_share.items())
]
lines: list[str] = []
lines.append(f"# Real-flight validation — {context.run_date_utc}")
lines.append("")
lines.append(f"**Verdict**: {verdict} (AC-3 gate: "
f"{AC3_GATE_PCT:.0f} % within "
f"{AC3_GATE_THRESHOLD_M:.0f} m)")
lines.append("")
lines.append("## Run context")
lines.append("")
lines.append(f"- Tlog: `{context.tlog_path}`")
lines.append(f"- Video: `{context.video_path}`")
lines.append(
f"- Calibration acquisition method: {context.calibration_acquisition_method}"
)
lines.append(f"- Clip duration: {context.clip_duration_s:.1f} s")
lines.append(f"- Emissions consumed: {context.emissions_count}")
lines.append(f"- Ground-truth pairings: {distribution.count}")
lines.append("")
lines.append("## Horizontal error (metres)")
lines.append("")
lines.append("| Statistic | Value |")
lines.append("| --------- | ----- |")
for name, value in horiz_rows:
lines.append(f"| {name} | {value:.2f} |")
lines.append("")
lines.append("## Threshold-hit share")
lines.append("")
lines.append("| Threshold (m) | Hit share (%) |")
lines.append("| ------------- | ------------- |")
for threshold, share in threshold_rows:
lines.append(f"| {threshold:g} | {share * 100.0:.1f} |")
lines.append("")
if distribution.vertical_count > 0:
lines.append("## Vertical error (metres)")
lines.append("")
lines.append("| Statistic | Value |")
lines.append("| --------- | ----- |")
lines.append(
f"| Mean | {distribution.vertical_error_mean_m:.2f} |"
)
lines.append(
f"| p50 | {distribution.vertical_error_p50_m:.2f} |"
)
lines.append(
f"| p95 | {distribution.vertical_error_p95_m:.2f} |"
)
lines.append(
f"| Samples | {distribution.vertical_count} |"
)
lines.append("")
else:
lines.append("## Vertical error")
lines.append("")
lines.append("_No emissions carried a comparable altitude — vertical stats skipped._")
lines.append("")
return "\n".join(lines) + "\n"
@@ -0,0 +1,54 @@
"""AZ-701 — `replay_api` HTTP service.
Operator-side HTTP wrapper around the offline replay pipeline:
`gps-denied-replay` (AZ-402) + `gps-denied-render-map` (AZ-700).
Lives outside the airborne binary — see contract at
``_docs/02_document/contracts/replay_api/replay_api_protocol.md``.
Public surface (re-exports below) is intentionally narrow:
- ``create_app`` — FastAPI app factory (for uvicorn + tests).
- ``JobRegistry`` + ``JobRecord`` + ``JobState`` — job-state machinery.
- ``ReplayRunner`` Protocol — DI seam (handlers depend on the
Protocol, not the concrete subprocess runner; unit tests inject
a fake runner).
- DTOs — ``JobSnapshot``, ``ReplayJobResult``.
"""
from __future__ import annotations
from gps_denied_onboard.replay_api.app import create_app
from gps_denied_onboard.replay_api.errors import (
JobNotCompleteError,
JobNotFoundError,
ReplayApiError,
ReplayRunnerError,
UnsupportedFileKindError,
)
from gps_denied_onboard.replay_api.interface import (
JobSnapshot,
JobState,
ReplayJobResult,
ReplayRunner,
)
from gps_denied_onboard.replay_api.jobs import (
ConcurrencyLimitReachedError,
JobRecord,
JobRegistry,
)
__all__ = [
"ConcurrencyLimitReachedError",
"JobNotCompleteError",
"JobNotFoundError",
"JobRecord",
"JobRegistry",
"JobSnapshot",
"JobState",
"ReplayApiError",
"ReplayJobResult",
"ReplayRunner",
"ReplayRunnerError",
"UnsupportedFileKindError",
"create_app",
]
+677
View File
@@ -0,0 +1,677 @@
"""AZ-701 — FastAPI app factory + production subprocess runner.
The factory takes the (runner, storage, registry) trio so unit
tests can wire in fakes; ``main()`` (in
``cli/replay_api_entrypoint.py``) constructs the production
subprocess runner against the configured environment.
Note: this file deliberately does NOT use ``from __future__ import
annotations``. FastAPI 0.119 + Pydantic 2.x resolve the route
parameter annotations at decoration time, which requires the
``Annotated[UploadFile, File()]`` form to be evaluable as real
types — not as forward-ref strings. Other modules in the
``replay_api`` package keep the future-annotations import; only
this one drops it for the route signatures.
"""
import logging
import os
import shutil
import subprocess
import sys
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path
from typing import Any
from gps_denied_onboard.replay_api.errors import (
ConcurrencyLimitReachedError,
JobNotCompleteError,
JobNotFoundError,
MultipartMissingFieldError,
PayloadTooLargeError,
ReplayApiError,
ReplayRunnerError,
UnauthorizedError,
UnsupportedFileKindError,
)
from gps_denied_onboard.replay_api.handlers import (
MIN_TLOG_PROBE_BYTES,
MIN_VIDEO_PROBE_BYTES,
auth_required,
expected_bearer_token,
extract_bearer_token,
validate_calibration_kind,
validate_tlog_kind,
validate_upload_size,
validate_video_kind,
)
from gps_denied_onboard.replay_api.interface import (
JobSnapshot,
JobState,
ReplayInputs,
ReplayJobResult,
ReplayRunner,
)
from gps_denied_onboard.replay_api.jobs import JobRegistry
from gps_denied_onboard.replay_api.storage import StorageRoot
__all__ = ["SubprocessReplayRunner", "build_runner_from_env", "create_app"]
_LOGGER = logging.getLogger("gps_denied_onboard.replay_api")
_PROBE_BYTES_MAX: int = max(MIN_TLOG_PROBE_BYTES, MIN_VIDEO_PROBE_BYTES, 64)
# ---------------------------------------------------------------------
# Production runner
class SubprocessReplayRunner:
"""Shells out to ``gps-denied-replay`` + ``gps-denied-render-map``.
Each ``run()`` call writes a minimal replay-mode ``config.yaml``
into the per-job output directory, invokes the replay CLI with
``--auto-trim``, computes the AZ-699 accuracy report from the
JSONL + the AZ-697 ground-truth extraction, and renders the
AZ-700 HTML map. The result is the trio of artefact paths the
handler streams back to the client.
"""
def __init__(
self,
*,
replay_binary: str = "gps-denied-replay",
render_binary: str = "gps-denied-render-map",
subprocess_timeout_s: float = 900.0,
) -> None:
self._replay_binary = replay_binary
self._render_binary = render_binary
self._timeout = subprocess_timeout_s
def run(
self, inputs: ReplayInputs, *, output_dir: Path
) -> ReplayJobResult:
config_path = output_dir / "config.yaml"
config_path.write_text(
"mode: replay\n"
"replay:\n"
f" pace: {inputs.pace}\n"
" target_fc_dialect: ardupilot_plane\n"
)
signing_key_path = output_dir / "signing_key.bin"
signing_key_path.write_bytes(b"\x00" * 32)
emissions_path = output_dir / "emissions.jsonl"
argv = [
self._replay_binary,
"--video",
str(inputs.video_path),
"--tlog",
str(inputs.tlog_path),
"--output",
str(emissions_path),
"--camera-calibration",
str(inputs.calibration_path),
"--config",
str(config_path),
"--mavlink-signing-key",
str(signing_key_path),
"--pace",
inputs.pace,
]
if inputs.auto_trim:
argv.append("--auto-trim")
replay_completed = subprocess.run(
argv,
capture_output=True,
text=True,
timeout=self._timeout,
)
if replay_completed.returncode != 0:
stderr_tail = (replay_completed.stderr or "")[-8192:]
raise ReplayRunnerError(
f"{self._replay_binary} exited "
f"{replay_completed.returncode}",
details={"stderr_tail": stderr_tail},
)
report_path = self._maybe_render_report(
inputs, emissions_path, output_dir
)
map_path = self._maybe_render_map(
inputs, emissions_path, output_dir, report_path
)
return ReplayJobResult(
emissions_jsonl_path=emissions_path,
accuracy_report_md_path=report_path,
map_html_path=map_path,
)
def _maybe_render_report(
self,
inputs: ReplayInputs,
emissions_path: Path,
output_dir: Path,
) -> Path | None:
"""Compute the AZ-699 accuracy report; tolerate missing GT."""
try:
import json
from gps_denied_onboard.helpers.accuracy_report import (
AC3_GATE_THRESHOLD_M,
ReportContext,
render_report,
verdict_passes_ac3,
)
from gps_denied_onboard.helpers.gps_compare import (
GroundTruthRow,
horizontal_error_distribution,
)
from gps_denied_onboard.replay_input import (
load_tlog_ground_truth,
)
except Exception as exc:
_LOGGER.warning(
"skipping accuracy report — imports failed: %r", exc
)
return None
emissions: list[dict[str, Any]] = []
for line in emissions_path.read_text().splitlines():
if not line.strip():
continue
emissions.append(json.loads(line))
if not emissions:
return None
gt_series = load_tlog_ground_truth(inputs.tlog_path).records
if not gt_series:
return None
ground_truth = [
GroundTruthRow(
t_s=fix.ts_ns / 1e9,
lat_deg=fix.lat_deg,
lon_deg=fix.lon_deg,
alt_m=fix.alt_m,
)
for fix in gt_series
]
distribution = horizontal_error_distribution(emissions, ground_truth)
if distribution.count == 0:
return None
try:
calibration_method = _calibration_acquisition_method(
inputs.calibration_path
)
except (OSError, ValueError):
calibration_method = "unknown"
clip_duration_s = (
ground_truth[-1].t_s - ground_truth[0].t_s
if len(ground_truth) > 1
else 0.0
)
context = ReportContext(
run_date_utc=datetime.utcnow().date().isoformat(),
tlog_path=inputs.tlog_path,
video_path=inputs.video_path,
calibration_acquisition_method=calibration_method,
clip_duration_s=clip_duration_s,
emissions_count=len(emissions),
)
passed = verdict_passes_ac3(distribution)
# Touch the threshold constant so a future rename surfaces here too.
assert AC3_GATE_THRESHOLD_M > 0.0
report_text = render_report(distribution, context, passed=passed)
report_path = output_dir / "accuracy_report.md"
report_path.write_text(report_text)
return report_path
def _maybe_render_map(
self,
inputs: ReplayInputs,
emissions_path: Path,
output_dir: Path,
report_path: Path | None,
) -> Path | None:
if not shutil.which(self._render_binary):
venv_bin = Path(sys.executable).parent / self._render_binary
if not venv_bin.exists():
_LOGGER.warning(
"%s not on PATH — skipping map render",
self._render_binary,
)
return None
render_bin = str(venv_bin)
else:
render_bin = self._render_binary
map_path = output_dir / "map.html"
argv = [
render_bin,
"--estimated",
str(emissions_path),
"--truth",
str(inputs.tlog_path),
"--output",
str(map_path),
]
if report_path is not None:
argv.extend(["--summary", str(report_path)])
completed = subprocess.run(
argv, capture_output=True, text=True, timeout=120
)
if completed.returncode != 0:
_LOGGER.warning(
"%s exited %s — map render skipped (stderr_tail=%r)",
self._render_binary,
completed.returncode,
completed.stderr[-2048:],
)
return None
return map_path
def _calibration_acquisition_method(calibration_path: Path) -> str:
import json
data = json.loads(calibration_path.read_text())
method = data.get("acquisition_method")
if isinstance(method, str) and method:
return method
return "unknown"
def build_runner_from_env() -> SubprocessReplayRunner:
return SubprocessReplayRunner(
replay_binary=os.environ.get(
"REPLAY_API_REPLAY_BINARY", "gps-denied-replay"
),
render_binary=os.environ.get(
"REPLAY_API_RENDER_BINARY", "gps-denied-render-map"
),
subprocess_timeout_s=float(
os.environ.get("REPLAY_API_SUBPROCESS_TIMEOUT_S", "900")
),
)
# ---------------------------------------------------------------------
# FastAPI app factory
def create_app(
*,
runner: ReplayRunner,
storage: StorageRoot,
registry: JobRegistry | None = None,
max_upload_bytes: int = 2 * 1024 * 1024 * 1024,
sync_max_bytes: int = 200 * 1024 * 1024,
) -> Any:
"""Build the FastAPI app.
Args:
runner: ``ReplayRunner`` injected into the registry.
storage: Per-job storage manager.
registry: Pre-built ``JobRegistry`` (the unit tests inject a
tuned one; production wiring builds one from env).
max_upload_bytes: hard limit per multipart upload (413 above).
sync_max_bytes: video size at which the API switches to async.
"""
try:
from typing import Annotated
from fastapi import (
FastAPI,
File,
Form,
Header,
HTTPException,
Request,
Response,
UploadFile,
)
from fastapi.responses import FileResponse, JSONResponse
except ImportError as exc:
raise SystemExit(
"FastAPI is not installed. Install with "
"`pip install gps-denied-onboard[operator-tools]`."
) from exc
if registry is None:
registry = JobRegistry(
runner=runner,
storage=storage,
max_concurrent=int(
os.environ.get("REPLAY_API_MAX_CONCURRENT_JOBS", "1")
),
max_queued=int(
os.environ.get("REPLAY_API_MAX_QUEUED_JOBS", "8")
),
)
@asynccontextmanager
async def lifespan(_: Any) -> AsyncIterator[None]:
if not auth_required():
_LOGGER.warning(
"REPLAY_API_AUTH_REQUIRED=false — bearer auth is DISABLED. "
"Do not run this in any environment exposed to the internet."
)
try:
yield
finally:
registry.shutdown(wait=False)
app = FastAPI(
title="gps-denied-onboard replay API",
version="1.0.0",
description=(
"HTTP wrapper around the offline `gps-denied-replay` "
"pipeline. Upload (tlog + video [+ calibration]); "
"receive GPS fixes + an accuracy report + an HTML map."
),
lifespan=lifespan,
)
@app.exception_handler(ReplayApiError)
async def _on_replay_api_error(
_request: Request, exc: ReplayApiError
) -> JSONResponse:
return JSONResponse(
status_code=exc.status_code,
content={
"error_code": exc.error_code,
"message": exc.message,
"details": exc.details,
},
)
def _check_auth(authorization: str | None) -> None:
if not auth_required():
return
expected = expected_bearer_token()
actual = extract_bearer_token(authorization)
if expected is None or actual != expected:
raise UnauthorizedError("bearer token does not match")
@app.get("/healthz")
async def healthz() -> dict[str, str]:
return {"status": "ok"}
@app.get("/readyz")
async def readyz() -> Response:
binary = os.environ.get(
"REPLAY_API_REPLAY_BINARY", "gps-denied-replay"
)
if shutil.which(binary) is None and not (
Path(sys.executable).parent / binary
).exists():
return JSONResponse(
status_code=503,
content={
"status": "not_ready",
"reason": f"{binary} not on PATH",
},
)
if not os.access(storage.root, os.W_OK):
return JSONResponse(
status_code=503,
content={
"status": "not_ready",
"reason": f"{storage.root} is not writable",
},
)
return JSONResponse(content={"status": "ok"})
@app.post("/replay")
async def post_replay(
tlog: Annotated[UploadFile, File()],
video: Annotated[UploadFile, File()],
calibration: Annotated[UploadFile | None, File()] = None,
pace: Annotated[str, Form()] = "asap",
auto_trim: Annotated[bool, Form()] = True,
authorization: Annotated[str | None, Header()] = None,
) -> Response:
_check_auth(authorization)
tlog_bytes = await tlog.read()
validate_upload_size(len(tlog_bytes), limit=max_upload_bytes)
validate_tlog_kind(tlog_bytes[:_PROBE_BYTES_MAX])
video_bytes = await video.read()
validate_upload_size(len(video_bytes), limit=max_upload_bytes)
validate_video_kind(video_bytes[:_PROBE_BYTES_MAX])
calibration_bytes: bytes | None = None
if calibration is not None:
calibration_bytes = await calibration.read()
validate_upload_size(
len(calibration_bytes), limit=max_upload_bytes
)
validate_calibration_kind(calibration_bytes[:_PROBE_BYTES_MAX])
# Allocate per-job storage and write the uploads.
job_id = _new_job_id()
job_storage = storage.allocate_job(job_id)
job_storage.tlog_path.write_bytes(tlog_bytes)
job_storage.video_path.write_bytes(video_bytes)
if calibration_bytes is not None:
job_storage.calibration_path.write_bytes(calibration_bytes)
elif _default_calibration_path() is not None:
shutil.copyfile(
_default_calibration_path(), # type: ignore[arg-type]
job_storage.calibration_path,
)
else:
raise MultipartMissingFieldError(
"calibration field is required (no default calibration "
"bundled with this build of replay_api)"
)
inputs = ReplayInputs(
tlog_path=job_storage.tlog_path,
video_path=job_storage.video_path,
calibration_path=job_storage.calibration_path,
pace=pace,
auto_trim=auto_trim,
)
# Submit under the pre-allocated job_id so the storage
# directory (already populated with the uploads above) and
# the API-visible job id match.
try:
snapshot = registry.submit(
inputs,
output_dir=job_storage.output_dir,
job_id=job_id,
)
except Exception:
storage.release_job(job_id)
raise
sync_mode = len(video_bytes) <= sync_max_bytes
if not sync_mode:
return JSONResponse(
status_code=202,
headers={"Location": f"/jobs/{snapshot.job_id}"},
content=_snapshot_to_dict(snapshot, sync=False),
)
# Wait for terminal state in sync mode.
snapshot = _await_terminal(registry, snapshot.job_id)
if snapshot.state == JobState.FAILED:
raise ReplayRunnerError(
snapshot.error or "replay runner failed without a message"
)
return JSONResponse(
status_code=200,
content=_snapshot_to_dict(snapshot, sync=True),
)
@app.get("/jobs/{job_id}")
async def get_job(
job_id: str,
authorization: Annotated[str | None, Header()] = None,
) -> dict[str, Any]:
_check_auth(authorization)
try:
snapshot = registry.get(job_id)
except JobNotFoundError:
raise HTTPException(
status_code=404,
detail={
"error_code": "job_not_found",
"message": f"job {job_id} not found",
},
)
return _snapshot_to_dict(snapshot, sync=False)
def _require_done(job_id: str) -> JobSnapshot:
snapshot = registry.get(job_id)
if snapshot.state != JobState.DONE:
raise JobNotCompleteError(
f"job {job_id} state is {snapshot.state.value}; "
"result is only available when state=done"
)
return snapshot
@app.get("/jobs/{job_id}/result")
async def get_result(
job_id: str,
authorization: Annotated[str | None, Header()] = None,
) -> Response:
_check_auth(authorization)
snapshot = _require_done(job_id)
if snapshot.result is None:
raise JobNotCompleteError("job done but no result attached")
return FileResponse(
path=snapshot.result.emissions_jsonl_path,
media_type="application/x-ndjson",
filename="emissions.jsonl",
)
@app.get("/jobs/{job_id}/map")
async def get_map(
job_id: str,
authorization: Annotated[str | None, Header()] = None,
) -> Response:
_check_auth(authorization)
snapshot = _require_done(job_id)
if snapshot.result is None or snapshot.result.map_html_path is None:
raise JobNotCompleteError("map artefact unavailable")
return FileResponse(
path=snapshot.result.map_html_path,
media_type="text/html",
filename="map.html",
)
@app.get("/jobs/{job_id}/report")
async def get_report(
job_id: str,
authorization: Annotated[str | None, Header()] = None,
) -> Response:
_check_auth(authorization)
snapshot = _require_done(job_id)
if (
snapshot.result is None
or snapshot.result.accuracy_report_md_path is None
):
raise JobNotCompleteError("report artefact unavailable")
return FileResponse(
path=snapshot.result.accuracy_report_md_path,
media_type="text/markdown",
filename="accuracy_report.md",
)
# Stash so unit tests can introspect.
app.state.registry = registry
app.state.storage = storage
# Silence unused-import lint on dependency types.
_ = (Form, File)
# Reference the unused-but-kept errors so a future renamed
# member surfaces here loudly.
_ = (
ConcurrencyLimitReachedError,
PayloadTooLargeError,
UnsupportedFileKindError,
MultipartMissingFieldError,
)
return app
# ---------------------------------------------------------------------
# Helpers
def _new_job_id() -> str:
import uuid
return uuid.uuid4().hex
def _default_calibration_path() -> Path | None:
raw = os.environ.get("REPLAY_API_DEFAULT_CALIBRATION")
if not raw:
return None
path = Path(raw)
if path.is_file():
return path
return None
def _await_terminal(registry: JobRegistry, job_id: str) -> JobSnapshot:
"""Block until ``job_id`` reaches a terminal state.
Used in sync mode. The registry runs jobs in its own thread pool;
we poll with a short backoff. The handler endpoint is async, so
blocking here parks the FastAPI worker — that's acceptable for
sync mode by design (sync mode is the small-file path).
"""
import time
deadline = time.monotonic() + 1800.0 # 30 min safety bound
while time.monotonic() < deadline:
snap = registry.get(job_id)
if snap.state in (JobState.DONE, JobState.FAILED):
return snap
time.sleep(0.05)
raise ReplayRunnerError("sync replay exceeded 30 min safety bound")
def _snapshot_to_dict(snapshot: JobSnapshot, *, sync: bool) -> dict[str, Any]:
payload: dict[str, Any] = {
"job_id": snapshot.job_id,
"state": snapshot.state.value,
"submitted_at_utc": snapshot.submitted_at_utc.isoformat(),
"started_at_utc": (
snapshot.started_at_utc.isoformat()
if snapshot.started_at_utc
else None
),
"finished_at_utc": (
snapshot.finished_at_utc.isoformat()
if snapshot.finished_at_utc
else None
),
"error": snapshot.error,
"status_url": f"/jobs/{snapshot.job_id}",
"sync": sync,
}
if snapshot.result is not None:
payload["emissions_jsonl_url"] = (
f"/jobs/{snapshot.job_id}/result"
)
if snapshot.result.accuracy_report_md_path is not None:
payload["accuracy_report_md_url"] = (
f"/jobs/{snapshot.job_id}/report"
)
if snapshot.result.map_html_path is not None:
payload["map_html_url"] = f"/jobs/{snapshot.job_id}/map"
return payload
@@ -0,0 +1,86 @@
"""AZ-701 — typed HTTP error families for the replay_api service.
Every error has a stable ``error_code`` (string) the contract pins
in ``_docs/02_document/contracts/replay_api/replay_api_protocol.md``.
The handler layer translates these into JSON responses; the
business layer raises them without knowing about HTTP.
"""
from __future__ import annotations
from typing import Any
__all__ = [
"ConcurrencyLimitReachedError",
"JobNotCompleteError",
"JobNotFoundError",
"MultipartMissingFieldError",
"PayloadTooLargeError",
"ReplayApiError",
"ReplayRunnerError",
"UnauthorizedError",
"UnsupportedFileKindError",
]
class ReplayApiError(Exception):
"""Base for every typed replay_api error.
Subclasses pin a stable ``error_code`` and HTTP ``status_code``;
the handler layer reads both to build a JSON response.
"""
error_code: str = "replay_api_error"
status_code: int = 500
def __init__(self, message: str, details: dict[str, Any] | None = None) -> None:
super().__init__(message)
self.message = message
self.details = details or {}
class UnsupportedFileKindError(ReplayApiError):
error_code = "unsupported_file_kind"
status_code = 400
class MultipartMissingFieldError(ReplayApiError):
error_code = "multipart_missing_field"
status_code = 400
class UnauthorizedError(ReplayApiError):
error_code = "unauthorized"
status_code = 401
class JobNotFoundError(ReplayApiError):
error_code = "job_not_found"
status_code = 404
class JobNotCompleteError(ReplayApiError):
error_code = "job_not_complete"
status_code = 409
class PayloadTooLargeError(ReplayApiError):
error_code = "payload_too_large"
status_code = 413
class ConcurrencyLimitReachedError(ReplayApiError):
"""Raised when the queue is full.
Note: per-spec, hitting just the running-job concurrency limit
does NOT raise this — those jobs queue normally. The 429 case is
"queue itself is full" only.
"""
error_code = "concurrency_limit_reached"
status_code = 429
class ReplayRunnerError(ReplayApiError):
error_code = "replay_runner_failed"
status_code = 500
@@ -0,0 +1,152 @@
"""AZ-701 — multipart upload + magic-byte validation + auth helpers.
The functions here are deliberately framework-light: they take raw
bytes / streams and return validated artefacts. ``app.py`` wires
them into FastAPI dependencies; unit tests call them directly.
"""
from __future__ import annotations
import logging
import os
from gps_denied_onboard.replay_api.errors import (
MultipartMissingFieldError,
PayloadTooLargeError,
UnauthorizedError,
UnsupportedFileKindError,
)
__all__ = [
"MIN_TLOG_PROBE_BYTES",
"MIN_VIDEO_PROBE_BYTES",
"auth_required",
"expected_bearer_token",
"extract_bearer_token",
"validate_calibration_kind",
"validate_tlog_kind",
"validate_upload_size",
"validate_video_kind",
]
_LOGGER = logging.getLogger("gps_denied_onboard.replay_api.handlers")
# MAVLink magic bytes — pymavlink uses 0xFD for v2.0 and 0xFE for
# v1.0. The Derkachi tlog is v2.0; we accept both because some
# operators ship v1.0 captures from older autopilots.
_MAVLINK_MAGIC_V2: int = 0xFD
_MAVLINK_MAGIC_V1: int = 0xFE
MIN_TLOG_PROBE_BYTES: int = 9
# mp4 boxes start with a 4-byte size, then 4 ASCII bytes for the
# box type. The first box in every valid mp4 is ``ftyp`` (per
# ISO/IEC 14496-12). ``"ftyp"`` lives at offset 4.
_MP4_FTYP_MARKER: bytes = b"ftyp"
MIN_VIDEO_PROBE_BYTES: int = 12
def validate_tlog_kind(probe_bytes: bytes) -> None:
"""Reject anything that doesn't open with a MAVLink magic byte.
pymavlink's tlog format prefixes each record with an 8-byte
big-endian microsecond timestamp followed by the raw MAVLink
frame, which always starts with the magic byte. So byte 8 of
any well-formed tlog is the MAVLink magic.
"""
if len(probe_bytes) < MIN_TLOG_PROBE_BYTES:
raise UnsupportedFileKindError(
f"tlog probe too small (need ≥ {MIN_TLOG_PROBE_BYTES} bytes "
f"to validate magic; got {len(probe_bytes)})"
)
magic = probe_bytes[8]
if magic not in (_MAVLINK_MAGIC_V2, _MAVLINK_MAGIC_V1):
raise UnsupportedFileKindError(
f"tlog magic byte 0x{magic:02X} at offset 8 is not "
f"MAVLink (expected 0x{_MAVLINK_MAGIC_V2:02X} or "
f"0x{_MAVLINK_MAGIC_V1:02X})"
)
def validate_video_kind(probe_bytes: bytes) -> None:
"""Reject anything that doesn't have an ``ftyp`` box at offset 4.
The size prefix at bytes 0-3 varies; the marker is the
discriminator. This catches the common "operator renamed
`.zip` to `.mp4`" attack — the AC-9 case.
"""
if len(probe_bytes) < MIN_VIDEO_PROBE_BYTES:
raise UnsupportedFileKindError(
f"video probe too small (need ≥ {MIN_VIDEO_PROBE_BYTES} "
f"bytes to validate ftyp; got {len(probe_bytes)})"
)
marker = probe_bytes[4:8]
if marker != _MP4_FTYP_MARKER:
raise UnsupportedFileKindError(
"video does not begin with an mp4 'ftyp' box at offset 4 "
f"(saw {marker!r})"
)
def validate_calibration_kind(probe_bytes: bytes) -> None:
"""Light JSON-shape check; the renderer is the strict validator."""
if not probe_bytes:
raise UnsupportedFileKindError("calibration upload is empty")
stripped = probe_bytes.lstrip()
if not stripped.startswith(b"{"):
raise UnsupportedFileKindError(
"calibration must be a JSON object (first non-whitespace "
"byte should be '{')"
)
def validate_upload_size(num_bytes: int, *, limit: int) -> None:
if num_bytes > limit:
raise PayloadTooLargeError(
f"upload size {num_bytes} exceeds REPLAY_API_MAX_UPLOAD_BYTES "
f"({limit})"
)
def expected_bearer_token() -> str | None:
"""Read the configured bearer token at request time.
Returning ``None`` means auth is disabled (
``REPLAY_API_AUTH_REQUIRED=false``); the caller is expected to
have logged the WARN once at service start.
"""
if not auth_required():
return None
token = os.environ.get("REPLAY_API_BEARER_TOKEN")
if not token:
raise UnauthorizedError(
"REPLAY_API_BEARER_TOKEN is not configured but auth is required"
)
return token
def auth_required() -> bool:
value = os.environ.get("REPLAY_API_AUTH_REQUIRED", "true").lower()
return value not in {"0", "false", "no", "off"}
def extract_bearer_token(header_value: str | None) -> str:
"""Parse ``Authorization: Bearer <token>`` strictly."""
if not header_value:
raise UnauthorizedError("missing Authorization header")
parts = header_value.split(" ", 1)
if len(parts) != 2 or parts[0].strip().lower() != "bearer":
raise UnauthorizedError(
"Authorization header must be 'Bearer <token>'"
)
token = parts[1].strip()
if not token:
raise UnauthorizedError("Authorization bearer token is empty")
return token
def _ensure_field(name: str, value: object) -> None:
if value is None:
raise MultipartMissingFieldError(f"missing multipart field: {name}")
@@ -0,0 +1,99 @@
"""AZ-701 — DTOs + ``ReplayRunner`` Protocol for the replay_api service.
The Protocol is the dependency-injection seam: ``handlers.py``
depends on the Protocol, not the concrete ``SubprocessReplayRunner``.
Unit tests inject a deterministic fake; the production wiring in
``app.py`` constructs the subprocess runner.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Protocol, runtime_checkable
__all__ = [
"JobSnapshot",
"JobState",
"ReplayInputs",
"ReplayJobResult",
"ReplayRunner",
]
class JobState(str, Enum):
"""Job lifecycle.
The state machine is monotonic: ``queued → running → done`` (or
``failed`` from any non-terminal state). No back-transitions.
"""
QUEUED = "queued"
RUNNING = "running"
DONE = "done"
FAILED = "failed"
@dataclass(frozen=True, slots=True)
class ReplayInputs:
"""The (tlog + video + calibration) bundle a runner consumes.
Storage paths are absolute. The handler builds these from a
per-job temp directory (see ``storage.py``).
``pace`` and ``auto_trim`` mirror the ``gps-denied-replay`` CLI
flags; the runner is responsible for translating them into argv.
"""
tlog_path: Path
video_path: Path
calibration_path: Path
pace: str = "asap"
auto_trim: bool = True
@dataclass(frozen=True, slots=True)
class ReplayJobResult:
"""The artefacts a finished job exposes.
Each path is absolute and lives under the per-job storage dir.
The handler layer maps these to URLs in the JSON response.
"""
emissions_jsonl_path: Path
accuracy_report_md_path: Path | None
map_html_path: Path | None
@dataclass(slots=True)
class JobSnapshot:
"""Serialisable snapshot of one job.
Mutable; the registry mutates the snapshot in-place under its
lock and yields copies to API readers.
"""
job_id: str
state: JobState
submitted_at_utc: datetime
started_at_utc: datetime | None = None
finished_at_utc: datetime | None = None
error: str | None = None
result: ReplayJobResult | None = None
sync: bool = False
extra: dict[str, str] = field(default_factory=dict)
@runtime_checkable
class ReplayRunner(Protocol):
"""Runs the offline replay pipeline for one job.
The Protocol is intentionally synchronous — the registry runs it
in a worker thread. Returning normally signals success; raising
any exception signals failure and the registry records the
stringified message on the job.
"""
def run(self, inputs: ReplayInputs, *, output_dir: Path) -> ReplayJobResult: ...
+233
View File
@@ -0,0 +1,233 @@
"""AZ-701 — in-memory job registry with a concurrency limit.
``JobRegistry`` is the single source of truth for job state. It is
intentionally simple — a dict plus a thread pool plus a queue cap.
Operators that need durable history persist the JSONL + Markdown
report + HTML map artefacts out-of-band (invariant 2 in the
contract).
Concurrency model:
- ``max_concurrent``: at most this many jobs may be in state
``RUNNING`` at once. Excess submissions land in state ``QUEUED``
and get promoted by the worker pool.
- ``max_queued``: hard cap on queued jobs. Exceeding it raises
``ConcurrencyLimitReachedError`` (HTTP 429 at the handler layer).
The registry runs jobs in a thread pool (``ThreadPoolExecutor``)
so the FastAPI event loop is never blocked. The runner is
intentionally synchronous (``ReplayRunner.run``) because the
underlying ``gps-denied-replay`` subprocess is synchronous.
"""
from __future__ import annotations
import logging
import threading
import uuid
from concurrent.futures import ThreadPoolExecutor
from copy import copy
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from gps_denied_onboard.replay_api.errors import (
ConcurrencyLimitReachedError,
JobNotFoundError,
)
from gps_denied_onboard.replay_api.interface import (
JobSnapshot,
JobState,
ReplayInputs,
ReplayRunner,
)
from gps_denied_onboard.replay_api.storage import StorageRoot
__all__ = ["ConcurrencyLimitReachedError", "JobRecord", "JobRegistry"]
_LOGGER = logging.getLogger("gps_denied_onboard.replay_api.jobs")
class JobRecord:
"""Internal mutable view of one job.
The registry exposes copies of ``snapshot`` to callers — never
the live object — so external code cannot corrupt state.
"""
__slots__ = ("inputs", "output_dir", "snapshot")
def __init__(
self, inputs: ReplayInputs, output_dir: Path, snapshot: JobSnapshot
) -> None:
self.inputs = inputs
self.output_dir = output_dir
self.snapshot = snapshot
class JobRegistry:
"""In-memory job pool + worker dispatch."""
def __init__(
self,
runner: ReplayRunner,
storage: StorageRoot,
*,
max_concurrent: int = 1,
max_queued: int = 8,
) -> None:
if max_concurrent < 1:
raise ValueError("max_concurrent must be ≥ 1")
if max_queued < 0:
raise ValueError("max_queued must be ≥ 0")
self._runner = runner
self._storage = storage
self._max_concurrent = max_concurrent
self._max_queued = max_queued
self._lock = threading.Lock()
self._records: dict[str, JobRecord] = {}
self._running_count = 0
self._executor = ThreadPoolExecutor(
max_workers=max_concurrent,
thread_name_prefix="replay-api-job",
)
@property
def max_concurrent(self) -> int:
return self._max_concurrent
def submit(
self,
inputs: ReplayInputs,
output_dir: Path,
*,
job_id: str | None = None,
) -> JobSnapshot:
"""Register a new job; return its initial snapshot.
``job_id`` is optional — when omitted the registry generates
a fresh uuid. The handler layer passes its own id so the
per-job storage directory and the API-visible job id match.
State at return time is:
- ``RUNNING`` if a worker slot is free.
- ``QUEUED`` otherwise (within ``max_queued``).
Raises ``ConcurrencyLimitReachedError`` when the queue is full.
"""
with self._lock:
queued_count = sum(
1
for r in self._records.values()
if r.snapshot.state == JobState.QUEUED
)
if (
self._running_count >= self._max_concurrent
and queued_count >= self._max_queued
):
raise ConcurrencyLimitReachedError(
f"queue full: running={self._running_count}, "
f"queued={queued_count}, max_queued={self._max_queued}"
)
if job_id is None:
job_id = uuid.uuid4().hex
if job_id in self._records:
raise ValueError(
f"duplicate job_id supplied to submit(): {job_id}"
)
state = (
JobState.RUNNING
if self._running_count < self._max_concurrent
else JobState.QUEUED
)
snapshot = JobSnapshot(
job_id=job_id,
state=state,
submitted_at_utc=_utc_now(),
started_at_utc=_utc_now() if state == JobState.RUNNING else None,
)
record = JobRecord(inputs=inputs, output_dir=output_dir, snapshot=snapshot)
self._records[job_id] = record
if state == JobState.RUNNING:
self._running_count += 1
self._executor.submit(self._run_or_wait, job_id)
with self._lock:
return copy(self._records[job_id].snapshot)
def get(self, job_id: str) -> JobSnapshot:
with self._lock:
record = self._records.get(job_id)
if record is None:
raise JobNotFoundError(f"job not found: {job_id}")
return copy(record.snapshot)
def list_ids(self) -> list[str]:
with self._lock:
return list(self._records)
def running_count(self) -> int:
with self._lock:
return self._running_count
def queued_count(self) -> int:
with self._lock:
return sum(
1
for r in self._records.values()
if r.snapshot.state == JobState.QUEUED
)
def shutdown(self, *, wait: bool = True) -> None:
self._executor.shutdown(wait=wait, cancel_futures=not wait)
self._storage.cleanup_all()
def _run_or_wait(self, job_id: str) -> None:
with self._lock:
record = self._records.get(job_id)
if record is None:
return
try:
if record.snapshot.state == JobState.QUEUED:
self._wait_for_slot(record)
self._execute(record)
except Exception as exc:
self._mark_failed(record, exc)
def _wait_for_slot(self, record: JobRecord) -> None:
while True:
with self._lock:
if self._running_count < self._max_concurrent:
record.snapshot.state = JobState.RUNNING
record.snapshot.started_at_utc = _utc_now()
self._running_count += 1
return
threading.Event().wait(0.05)
def _execute(self, record: JobRecord) -> None:
try:
result = self._runner.run(record.inputs, output_dir=record.output_dir)
with self._lock:
record.snapshot.state = JobState.DONE
record.snapshot.finished_at_utc = _utc_now()
record.snapshot.result = result
self._running_count = max(0, self._running_count - 1)
except Exception:
with self._lock:
self._running_count = max(0, self._running_count - 1)
raise
def _mark_failed(self, record: JobRecord, exc: BaseException) -> None:
message = f"{type(exc).__name__}: {exc}"
_LOGGER.exception("job %s failed", record.snapshot.job_id)
with self._lock:
record.snapshot.state = JobState.FAILED
record.snapshot.finished_at_utc = _utc_now()
record.snapshot.error = message
def _utc_now() -> datetime:
return datetime.now(timezone.utc)
# Re-export Any so type-checkers don't trim the local import.
_ = Any
@@ -0,0 +1,89 @@
"""AZ-701 — per-job temp-file lifecycle.
One ``StorageRoot`` rooted at ``REPLAY_API_STORAGE_ROOT``.
Each job allocates a subdirectory ``<root>/<job_id>/`` containing
the uploaded ``tlog`` + ``video`` + ``calibration`` plus the
estimator's outputs (``emissions.jsonl``, the AZ-699 report, the
AZ-700 map).
The directory is deleted on job completion (``release_job``) and on
service shutdown (``cleanup_all``). The service deliberately does
NOT keep finished-job artefacts forever — invariant 2 in the
contract.
"""
from __future__ import annotations
import logging
import shutil
from dataclasses import dataclass
from pathlib import Path
__all__ = ["JobStorage", "StorageRoot"]
_LOGGER = logging.getLogger("gps_denied_onboard.replay_api.storage")
@dataclass(frozen=True, slots=True)
class JobStorage:
"""The per-job paths the handler hands to the runner."""
root: Path
tlog_path: Path
video_path: Path
calibration_path: Path
output_dir: Path
class StorageRoot:
"""Parent of per-job storage directories.
The class is intentionally thin — the registry calls
``allocate_job`` at submit-time and ``release_job`` at terminal
transitions; nothing else owns mutation rights.
"""
def __init__(self, root: Path) -> None:
self._root = root
self._root.mkdir(parents=True, exist_ok=True)
@property
def root(self) -> Path:
return self._root
def allocate_job(self, job_id: str) -> JobStorage:
job_root = self._root / job_id
job_root.mkdir(parents=True, exist_ok=False)
output_dir = job_root / "output"
output_dir.mkdir(parents=True, exist_ok=True)
return JobStorage(
root=job_root,
tlog_path=job_root / "input.tlog",
video_path=job_root / "input.mp4",
calibration_path=job_root / "calibration.json",
output_dir=output_dir,
)
def release_job(self, job_id: str) -> None:
target = self._root / job_id
if not target.exists():
return
try:
shutil.rmtree(target)
except OSError as exc:
_LOGGER.warning(
"failed to delete per-job storage %s: %s", target, exc
)
def cleanup_all(self) -> None:
for child in self._root.iterdir():
if child.is_dir():
try:
shutil.rmtree(child)
except OSError as exc:
_LOGGER.warning(
"failed to delete per-job storage %s: %s",
child,
exc,
)