From be4cab4fcb2d1955ea8b37def952b6697a31032b Mon Sep 17 00:00:00 2001
From: Oleksandr Bezdieniezhnykh <oleksandr.bezdieniezhnykh@pwc.com>
Date: Wed, 1 Apr 2026 03:11:43 +0300
Subject: [PATCH] [AZ-178] Implement streaming video detection endpoint

- Added `/detect/video` endpoint for true streaming video detection, allowing inference to start as upload bytes arrive.
- Introduced `run_detect_video_stream` method in the inference module to handle video processing from a file-like object.
- Updated media hashing to include a new function for computing hashes directly from files with minimal I/O.
- Enhanced documentation to reflect changes in video processing and API behavior.

Made-with: Cursor
---
 .cursor/skills/test-run/SKILL.md              |   2 +-
 .cursor/skills/test-spec/SKILL.md             |  19 +-
 .../test-spec/templates/run-tests-script.md   |  31 +-
 .gitignore                                    |   3 +
 .../03_inference_pipeline/description.md      |  17 +-
 .../components/04_api/description.md          |  12 +-
 _docs/02_document/modules/inference.md        |  13 +-
 _docs/02_document/modules/main.md             |  30 +-
 _docs/02_document/modules/media_hash.md       |   4 +-
 _docs/02_document/modules/streaming_buffer.md |  82 ++++
 _docs/02_document/system-flows.md             | 256 ++++++++++-
 .../AZ-177_remove_redundant_video_prewrite.md |   0
 .../AZ-178_true_streaming_video_detect.md     | 107 +++++
 .../implementation_report_streaming_video.md  |  74 +++
 .../implementation_report_video_prewrite.md   |  31 ++
 _docs/04_deploy/ci_cd_pipeline.md             | 148 ++++++
 _docs/04_deploy/containerization.md           | 125 ++++++
 _docs/04_deploy/deploy_scripts.md             |  65 +++
 _docs/04_deploy/deployment_procedures.md      |  70 +++
 _docs/04_deploy/environment_strategy.md       |  42 ++
 _docs/04_deploy/observability.md              |  69 +++
 .../04_deploy/reports/deploy_status_report.md |  56 +++
 _docs/05_security/dependency_scan.md          |  56 +++
 _docs/05_security/infrastructure_review.md    |  79 ++++
 _docs/05_security/owasp_review.md             |  67 +++
 _docs/05_security/security_report.md          | 140 ++++++
 _docs/05_security/static_analysis.md          |  72 +++
 _docs/_autopilot_state.md                     |   6 +-
 e2e/requirements.txt                          |   1 +
 e2e/tests/test_streaming_video_upload.py      | 249 ++++++++++
 run-tests.sh                                  |   3 +
 scripts/deploy.sh                             |  82 ++++
 scripts/health-check.sh                       |  39 ++
 scripts/pull-images.sh                        |  36 ++
 scripts/start-services.sh                     |  46 ++
 scripts/stop-services.sh                      |  38 ++
 src/inference.pyx                             |  19 +
 src/main.py                                   | 132 ++++++
 src/media_hash.py                             |  20 +
 src/streaming_buffer.py                       |  93 ++++
 tests/test_az178_realvideo_streaming.py       | 153 +++++++
 tests/test_az178_streaming_video.py           | 425 ++++++++++++++++++
 42 files changed, 2983 insertions(+), 29 deletions(-)
 create mode 100644 _docs/02_document/modules/streaming_buffer.md
 rename _docs/02_tasks/{todo => done}/AZ-177_remove_redundant_video_prewrite.md (100%)
 create mode 100644 _docs/02_tasks/todo/AZ-178_true_streaming_video_detect.md
 create mode 100644 _docs/03_implementation/implementation_report_streaming_video.md
 create mode 100644 _docs/03_implementation/implementation_report_video_prewrite.md
 create mode 100644 _docs/04_deploy/ci_cd_pipeline.md
 create mode 100644 _docs/04_deploy/containerization.md
 create mode 100644 _docs/04_deploy/deploy_scripts.md
 create mode 100644 _docs/04_deploy/deployment_procedures.md
 create mode 100644 _docs/04_deploy/environment_strategy.md
 create mode 100644 _docs/04_deploy/observability.md
 create mode 100644 _docs/04_deploy/reports/deploy_status_report.md
 create mode 100644 _docs/05_security/dependency_scan.md
 create mode 100644 _docs/05_security/infrastructure_review.md
 create mode 100644 _docs/05_security/owasp_review.md
 create mode 100644 _docs/05_security/security_report.md
 create mode 100644 _docs/05_security/static_analysis.md
 create mode 100644 e2e/tests/test_streaming_video_upload.py
 create mode 100755 scripts/deploy.sh
 create mode 100755 scripts/health-check.sh
 create mode 100755 scripts/pull-images.sh
 create mode 100755 scripts/start-services.sh
 create mode 100755 scripts/stop-services.sh
 create mode 100644 src/streaming_buffer.py
 create mode 100644 tests/test_az178_realvideo_streaming.py
 create mode 100644 tests/test_az178_streaming_video.py

diff --git a/.cursor/skills/test-run/SKILL.md b/.cursor/skills/test-run/SKILL.md
index af2b782..5b18693 100644
--- a/.cursor/skills/test-run/SKILL.md
+++ b/.cursor/skills/test-run/SKILL.md
@@ -56,7 +56,7 @@ Present a summary:
 ══════════════════════════════════════
 ```
 
-**Important**: Collection errors (import failures, missing dependencies, syntax errors) count as failures — they are not "skipped" or ignorable.
+**Important**: Collection errors (import failures, missing dependencies, syntax errors) count as failures — they are not "skipped" or ignorable. If a collection error is caused by a missing dependency, install it (add to the project's dependency file and install) before re-running. The test runner script (`run-tests.sh`) should install all dependencies automatically — if it doesn't, fix the script to do so.
 
 ### 4. Diagnose Failures and Skips
 
diff --git a/.cursor/skills/test-spec/SKILL.md b/.cursor/skills/test-spec/SKILL.md
index 04132f6..9d28556 100644
--- a/.cursor/skills/test-spec/SKILL.md
+++ b/.cursor/skills/test-spec/SKILL.md
@@ -435,17 +435,20 @@ Write or update a **"Test Execution"** section in `TESTS_OUTPUT_DIR/environment.
 3. Identify performance/load testing tools from dependencies (k6, locust, artillery, wrk, or built-in benchmarks)
 4. Read `TESTS_OUTPUT_DIR/environment.md` for infrastructure requirements
 
-#### Step 2 — Generate `scripts/run-tests.sh`
+#### Step 2 — Generate test runner
 
-Create `scripts/run-tests.sh` at the project root using `.cursor/skills/test-spec/templates/run-tests-script.md` as structural guidance. The script must:
+**Docker is the default.** Only generate a local `scripts/run-tests.sh` if the Hardware-Dependency Assessment determined **local** or **both** execution (i.e., the project requires real hardware like GPU/CoreML/TPU/sensors). For all other projects, use `docker-compose.test.yml` — it provides reproducibility, isolation, and CI parity without a custom shell script.
+
+**If local script is needed** — create `scripts/run-tests.sh` at the project root using `.cursor/skills/test-spec/templates/run-tests-script.md` as structural guidance. The script must:
 
 1. Set `set -euo pipefail` and trap cleanup on EXIT
-2. Optionally accept a `--unit-only` flag to skip blackbox tests
-3. Run unit/blackbox tests using the detected test runner:
-   - **Local mode**: activate virtualenv (if present), run test runner directly on host
-   - **Docker mode**: spin up docker-compose environment, wait for health checks, run test suite, tear down
-4. Print a summary of passed/failed/skipped tests
-5. Exit 0 on all pass, exit 1 on any failure
+2. **Install all project and test dependencies** (e.g. `pip install -q -r requirements.txt -r e2e/requirements.txt`, `dotnet restore`, `npm ci`). This prevents collection-time import errors on fresh environments.
+3. Optionally accept a `--unit-only` flag to skip blackbox tests
+4. Run unit/blackbox tests using the detected test runner (activate virtualenv if present, run test runner directly on host)
+5. Print a summary of passed/failed/skipped tests
+6. Exit 0 on all pass, exit 1 on any failure
+
+**If Docker** — generate or update `docker-compose.test.yml` that builds the test image, installs all dependencies inside the container, runs the test suite, and exits with the test runner's exit code.
 
 #### Step 3 — Generate `scripts/run-performance-tests.sh`
 
diff --git a/.cursor/skills/test-spec/templates/run-tests-script.md b/.cursor/skills/test-spec/templates/run-tests-script.md
index e5c41ff..76a6de1 100644
--- a/.cursor/skills/test-spec/templates/run-tests-script.md
+++ b/.cursor/skills/test-spec/templates/run-tests-script.md
@@ -2,7 +2,13 @@
 
 Reference for generating `scripts/run-tests.sh` and `scripts/run-performance-tests.sh`.
 
-## `scripts/run-tests.sh`
+## When to generate a local `run-tests.sh`
+
+A local shell script is needed **only** for hardware-dependent projects that require real hardware (GPU, CoreML, TPU, sensors, etc.) to exercise the actual code paths. If the Hardware-Dependency Assessment (Phase 4 prerequisite) determined **local** or **both** execution, generate this script.
+
+For all other projects, **use Docker** (`docker-compose.test.yml` / `Dockerfile.test`). Docker is the default — it provides reproducibility, isolation, and CI parity. Do not generate a local `run-tests.sh` when Docker is sufficient.
+
+## `scripts/run-tests.sh` (local / hardware-dependent only)
 
 ```bash
 #!/usr/bin/env bash
@@ -20,23 +26,33 @@ for arg in "$@"; do
 done
 
 cleanup() {
-  # tear down docker-compose if it was started
+  # tear down services started by this script
 }
 trap cleanup EXIT
 
 mkdir -p "$RESULTS_DIR"
 
+# --- Install Dependencies ---
+# MANDATORY: install all project + test dependencies before building or running.
+# A fresh clone or CI runner may have nothing installed.
+# Python:  pip install -q -r requirements.txt -r e2e/requirements.txt
+# .NET:    dotnet restore
+# Rust:    cargo fetch
+# Node:    npm ci
+
+# --- Build (if needed) ---
+# [e.g. Cython: python setup.py build_ext --inplace]
+
 # --- Unit Tests ---
 # [detect runner: pytest / dotnet test / cargo test / npm test]
 # [run and capture exit code]
-# [save results to $RESULTS_DIR/unit-results.*]
 
 # --- Blackbox Tests (skip if --unit-only) ---
 # if ! $UNIT_ONLY; then
-#   [docker compose -f <compose-file> up -d]
+#   [start mock services]
+#   [start system under test]
 #   [wait for health checks]
 #   [run blackbox test suite]
-#   [save results to $RESULTS_DIR/blackbox-results.*]
 # fi
 
 # --- Summary ---
@@ -61,6 +77,9 @@ trap cleanup EXIT
 
 mkdir -p "$RESULTS_DIR"
 
+# --- Install Dependencies ---
+# [same as above — always install first]
+
 # --- Start System Under Test ---
 # [docker compose up -d or start local server]
 # [wait for health checks]
@@ -80,6 +99,8 @@ mkdir -p "$RESULTS_DIR"
 
 ## Key Requirements
 
+- **Docker is the default**: only generate a local `run-tests.sh` for hardware-dependent projects. Otherwise use `docker-compose.test.yml`.
+- **Always install dependencies first**: the script must install all project and test dependencies before building or running tests. A fresh clone or CI runner may have nothing installed. Missing a single dependency causes collection errors that abort the entire test run.
 - Both scripts must be idempotent (safe to run multiple times)
 - Both scripts must work in CI (no interactive prompts, no GUI)
 - Use `trap cleanup EXIT` to ensure teardown even on failure
diff --git a/.gitignore b/.gitignore
index cb4d99e..846a465 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,5 +63,8 @@ e2e/logs/
 !e2e/results/.gitkeep
 !e2e/logs/.gitkeep
 
+# Deployment state
+.deploy-previous-tag
+
 # Runtime logs
 Logs/
diff --git a/_docs/02_document/components/03_inference_pipeline/description.md b/_docs/02_document/components/03_inference_pipeline/description.md
index 2a0dce9..4b7c16c 100644
--- a/_docs/02_document/components/03_inference_pipeline/description.md
+++ b/_docs/02_document/components/03_inference_pipeline/description.md
@@ -28,6 +28,8 @@ cdef class Inference:
                            annotation_callback, status_callback=None)
     cpdef run_detect_video(bytes video_bytes, AIRecognitionConfig ai_config, str media_name,
                            str save_path, annotation_callback, status_callback=None)
+    cpdef run_detect_video_stream(object readable, AIRecognitionConfig ai_config, str media_name,
+                                  annotation_callback, status_callback=None)
     cpdef stop()
 
     # Internal pipeline stages:
@@ -60,6 +62,7 @@ class LoaderHttpClient:
 
 ```
 def compute_media_content_hash(data: bytes, virtual: bool = False) -> str
+def compute_media_content_hash_from_file(path: str, virtual: bool = False) -> str
 ```
 
 ## External API
@@ -70,9 +73,10 @@ None — internal component, consumed by API layer.
 
 - Model bytes downloaded from Loader service (HTTP)
 - Converted TensorRT engines uploaded back to Loader for caching
-- Video frames decoded from in-memory bytes via PyAV (`av.open(BytesIO)`)
+- Video frames decoded from in-memory bytes via PyAV (`av.open(BytesIO)`) — `run_detect_video`
+- Video frames decoded from streaming file-like via PyAV (`av.open(readable)`) — `run_detect_video_stream` (AZ-178)
 - Images decoded from in-memory bytes via `cv2.imdecode`
-- Video bytes concurrently written to persistent storage path in background thread
+- Video bytes concurrently written to persistent storage path in background thread (`run_detect_video`) or via StreamingBuffer (`run_detect_video_stream`)
 - All inference processing is in-memory
 
 ## Implementation Details
@@ -119,6 +123,15 @@ None — internal component, consumed by API layer.
 - Annotation validity heuristics: time gap, detection count increase, spatial movement, confidence improvement
 - JPEG encoding of valid frames for annotation images
 
+### Streaming Video Processing (AZ-178)
+
+- `run_detect_video_stream` accepts a file-like `readable` (e.g. `StreamingBuffer`) instead of `bytes`
+- Opens `av.open(readable)` directly — PyAV calls `read()`/`seek()` on the object as needed
+- No writer thread — the `StreamingBuffer` already persists data to disk as the HTTP handler feeds it chunks
+- Reuses `_process_video_pyav` for all frame decoding, batching, and annotation logic
+- For faststart MP4/MKV/WebM: true streaming (~500ms to first frame)
+- For standard MP4 (moov at end): graceful degradation via blocking SEEK_END
+
 ### Callbacks
 
 - `annotation_callback(annotation, percent)` — called per valid annotation
diff --git a/_docs/02_document/components/04_api/description.md b/_docs/02_document/components/04_api/description.md
index 1629cfc..25e14b7 100644
--- a/_docs/02_document/components/04_api/description.md
+++ b/_docs/02_document/components/04_api/description.md
@@ -14,6 +14,7 @@
 | Module | Role |
 |--------|------|
 | `main` | FastAPI app definition, endpoints, DTOs, TokenManager, SSE streaming, media lifecycle, DB-driven config resolution |
+| `streaming_buffer` | File-like object for concurrent write+read — enables true streaming video detection (AZ-178) |
 
 ## External API Specification
 
@@ -50,6 +51,13 @@
 **Behavior** (AZ-173, AZ-175): Accepts both images and videos. Detects upload kind by extension, falls back to content probing. If authenticated: computes content hash, persists to storage, creates media record, tracks status lifecycle (New → AI Processing → AI Processed / Error).
 **Errors**: 400 (empty/invalid image data), 422 (runtime error), 503 (engine unavailable).
 
+### POST /detect/video
+
+**Input**: Raw binary body (not multipart). Headers: `X-Filename` (e.g. `clip.mp4`), optional `X-Config` (JSON), optional `Authorization: Bearer {token}`, optional `X-Refresh-Token`.
+**Response**: `{"status": "started", "mediaId": "..."}`
+**Behavior** (AZ-178): True streaming video detection. Bypasses Starlette multipart buffering by accepting raw body via `request.stream()`. Creates a `StreamingBuffer` (temp file), starts inference thread immediately, feeds HTTP chunks to the buffer as they arrive. PyAV reads from the buffer concurrently, decoding frames and running inference. Detections broadcast via SSE in real-time during upload. After upload: computes content hash from file (3 KB I/O), renames to permanent path, creates media record if authenticated.
+**Errors**: 400 (non-video extension).
+
 ### POST /detect/{media_id}
 
 **Input**: Path param `media_id`, optional JSON body `AIConfigDto`, headers `Authorization: Bearer {token}`, `x-refresh-token: {token}`.
@@ -82,7 +90,8 @@ data: {"annotations": [...], "mediaId": "...", "mediaStatus": "AIProcessing", "m
 - `TokenManager.decode_user_id` extracts user identity from multiple JWT claim formats (sub, userId, nameid, SAML)
 - DB-driven config via `_resolve_media_for_detect`: fetches AI settings from Annotations, merges nested sections and casing variants
 - Media lifecycle: `_post_media_record` + `_put_media_status` manage status transitions via Annotations API
-- Content hashing via `compute_media_content_hash` (XxHash64 with sampling) for media deduplication
+- Content hashing via `compute_media_content_hash` (bytes, XxHash64 with sampling) and `compute_media_content_hash_from_file` (file on disk, 3 KB I/O) for media deduplication
+- `StreamingBuffer` for `/detect/video`: concurrent file append + read via `threading.Condition`, enables PyAV to decode frames as HTTP chunks arrive
 
 ## Caveats
 
@@ -103,6 +112,7 @@ graph TD
     main --> constants_inf
     main --> loader_http_client
     main --> media_hash
+    main --> streaming_buffer
 ```
 
 ## Logging Strategy
diff --git a/_docs/02_document/modules/inference.md b/_docs/02_document/modules/inference.md
index db2f803..ec4f29b 100644
--- a/_docs/02_document/modules/inference.md
+++ b/_docs/02_document/modules/inference.md
@@ -39,6 +39,7 @@ Core inference orchestrator — manages the AI engine lifecycle, preprocesses me
 | `__init__` | `(loader_client)` | public | Initializes state, calls `init_ai()` |
 | `run_detect_image` | `(bytes image_bytes, AIRecognitionConfig ai_config, str media_name, annotation_callback, status_callback=None)` | cpdef | Decodes image from bytes, runs tiling + inference + postprocessing |
 | `run_detect_video` | `(bytes video_bytes, AIRecognitionConfig ai_config, str media_name, str save_path, annotation_callback, status_callback=None)` | cpdef | Processes video from in-memory bytes via PyAV, concurrently writes to save_path |
+| `run_detect_video_stream` | `(object readable, AIRecognitionConfig ai_config, str media_name, annotation_callback, status_callback=None)` | cpdef | Processes video from a file-like readable (e.g. StreamingBuffer) via PyAV — true streaming, no bytes in RAM (AZ-178) |
 | `stop` | `()` | cpdef | Sets stop_signal to True |
 | `init_ai` | `()` | cdef | Engine initialization: tries TensorRT → falls back to ONNX → background TensorRT conversion |
 | `preprocess` | `(frames) -> ndarray` | via engine | OpenCV blobFromImage: resize, normalize to 0..1, swap RGB, stack batch |
@@ -75,6 +76,15 @@ Both `run_detect_image` and `run_detect_video` accept raw bytes instead of file
 6. Annotation validity heuristics (time gap, detection count increase, spatial movement, confidence improvement)
 7. Valid frames get JPEG-encoded image attached
 
+### Streaming Video Processing (`run_detect_video_stream` — AZ-178)
+
+1. Accepts a file-like `readable` object (e.g. `StreamingBuffer`) instead of `bytes`
+2. Opens directly via `av.open(readable)` — PyAV calls `read()`/`seek()` on the object
+3. No writer thread needed — the caller (API layer) manages disk persistence via the same buffer
+4. Reuses `_process_video_pyav` for frame decoding, batch inference, and annotation delivery
+5. For faststart MP4/MKV/WebM: frames are decoded as bytes stream in (~500ms latency)
+6. For standard MP4 (moov at end): PyAV's `seek(0, 2)` blocks until the buffer signals EOF, then decoding starts
+
 ### Ground Sampling Distance (GSD)
 
 `GSD = sensor_width * altitude / (focal_length * image_width)` — meters per pixel, used for physical size filtering of aerial detections.
@@ -86,7 +96,7 @@ Both `run_detect_image` and `run_detect_video` accept raw bytes instead of file
 
 ## Consumers
 
-- `main` — lazy-initializes Inference, calls `run_detect_image`/`run_detect_video`, reads `ai_availability_status` and `is_engine_ready`
+- `main` — lazy-initializes Inference, calls `run_detect_image`/`run_detect_video`/`run_detect_video_stream`, reads `ai_availability_status` and `is_engine_ready`
 
 ## Data Models
 
@@ -107,5 +117,6 @@ None.
 ## Tests
 
 - `tests/test_ai_config_from_dict.py` — tests `ai_config_from_dict` helper
+- `tests/test_az178_streaming_video.py` — tests `run_detect_video_stream` via the `/detect/video` endpoint and `StreamingBuffer`
 - `e2e/tests/test_video.py` — exercises `run_detect_video` via the full API
 - `e2e/tests/test_single_image.py` — exercises `run_detect_image` via the full API
diff --git a/_docs/02_document/modules/main.md b/_docs/02_document/modules/main.md
index 4bb392b..1c4f4e5 100644
--- a/_docs/02_document/modules/main.md
+++ b/_docs/02_document/modules/main.md
@@ -11,7 +11,8 @@ FastAPI application entry point — exposes HTTP API for object detection on ima
 | Method | Path | Description |
 |--------|------|-------------|
 | GET | `/health` | Returns AI engine availability status |
-| POST | `/detect` | Image/video detection with media lifecycle management |
+| POST | `/detect` | Image/video detection with media lifecycle management (buffered) |
+| POST | `/detect/video` | Streaming video detection — inference starts as upload bytes arrive (AZ-178) |
 | POST | `/detect/{media_id}` | Start async detection on media resolved from Annotations service |
 | GET | `/detect/stream` | SSE stream of detection events |
 
@@ -41,7 +42,8 @@ FastAPI application entry point — exposes HTTP API for object detection on ima
 | `_detect_upload_kind` | `(filename, data) -> tuple[str, str]` | Determines if upload is image or video by extension, falls back to content probing (cv2/PyAV) |
 | `_post_media_record` | `(payload, bearer) -> bool` | Creates media record via `POST /api/media` on Annotations service |
 | `_put_media_status` | `(media_id, status, bearer) -> bool` | Updates media status via `PUT /api/media/{media_id}/status` on Annotations service |
-| `compute_media_content_hash` | (imported from `media_hash`) | XxHash64 content hash with sampling |
+| `compute_media_content_hash` | (imported from `media_hash`) | XxHash64 content hash with sampling (from bytes) |
+| `compute_media_content_hash_from_file` | (imported from `media_hash`) | XxHash64 content hash from file on disk — reads only 3 KB |
 
 ## Internal Logic
 
@@ -57,9 +59,10 @@ Returns `HealthResponse` with `status="healthy"` always. `aiAvailability` reflec
 4. Parses optional JSON config
 5. Extracts auth tokens; if authenticated:
    a. Computes XxHash64 content hash
-   b. Persists file to `VIDEOS_DIR` or `IMAGES_DIR`
-   c. Creates media record via `POST /api/media`
-   d. Sets status to `AI_PROCESSING` via `PUT /api/media/{id}/status`
+   b. For images: persists file to `IMAGES_DIR` synchronously (since `run_detect_image` does not write to disk)
+   c. For videos: file path is prepared but writing is deferred to `run_detect_video` which writes concurrently with frame detection (AZ-177)
+   d. Creates media record via `POST /api/media`
+   e. Sets status to `AI_PROCESSING` via `PUT /api/media/{id}/status`
 6. Runs `run_detect_image` or `run_detect_video` in ThreadPoolExecutor
 7. On success: sets status to `AI_PROCESSED`
 8. On failure: sets status to `ERROR`
@@ -80,6 +83,20 @@ Returns `HealthResponse` with `status="healthy"` always. `aiAvailability` reflec
 8. Updates media status via `PUT /api/media/{id}/status`
 9. Returns immediately: `{"status": "started", "mediaId": media_id}`
 
+### `/detect/video` (streaming upload — AZ-178)
+
+1. Parses `X-Filename`, `X-Config`, auth headers (no multipart — raw binary body)
+2. Validates video extension
+3. Creates `StreamingBuffer` backed by a temp file in `VIDEOS_DIR`
+4. Starts inference thread via `run_in_executor`: `run_detect_video_stream(buffer, ...)`
+5. Reads HTTP body chunks via `request.stream()`, feeds each to `buffer.append()` via executor
+6. Inference thread reads from the same buffer concurrently — PyAV decodes frames as data arrives
+7. Detections are broadcast to SSE queues in real-time during upload
+8. After upload completes: signals EOF, computes content hash from temp file (3 KB read), renames to permanent path
+9. If authenticated: creates media record, tracks status lifecycle
+10. Returns `{"status": "started", "mediaId": "..."}` — inference continues in background task
+11. Background task awaits inference completion, updates status to AI_PROCESSED or Error
+
 ### `/detect/stream` (SSE)
 
 - Creates asyncio.Queue per client (maxsize=100)
@@ -99,7 +116,7 @@ Detections posts results to `POST {ANNOTATIONS_URL}/annotations` during async me
 ## Dependencies
 
 - **External**: `asyncio`, `base64`, `io`, `json`, `os`, `tempfile`, `time`, `concurrent.futures`, `pathlib`, `typing`, `av`, `cv2`, `numpy`, `requests`, `fastapi`, `pydantic`
-- **Internal**: `inference` (lazy import), `constants_inf` (label lookup), `loader_http_client` (client instantiation), `media_hash` (content hashing)
+- **Internal**: `inference` (lazy import), `constants_inf` (label lookup), `loader_http_client` (client instantiation), `media_hash` (content hashing), `streaming_buffer` (streaming video upload)
 
 ## Consumers
 
@@ -141,4 +158,5 @@ None (entry point).
 
 - `tests/test_az174_db_driven_config.py` — `decode_user_id`, `_merged_annotation_settings_payload`, `_resolve_media_for_detect`
 - `tests/test_az175_api_calls.py` — `_post_media_record`, `_put_media_status`
+- `tests/test_az177_video_single_write.py` — video single-write, image unchanged, concurrent writer thread, temp cleanup
 - `e2e/tests/test_*.py` — full API e2e tests (health, single image, video, async, SSE, negative, security, performance, resilience)
diff --git a/_docs/02_document/modules/media_hash.md b/_docs/02_document/modules/media_hash.md
index 6eb9559..917a06d 100644
--- a/_docs/02_document/modules/media_hash.md
+++ b/_docs/02_document/modules/media_hash.md
@@ -9,6 +9,7 @@ Content-based hashing for media files using XxHash64 with a deterministic sampli
 | Function | Signature | Description |
 |----------|-----------|-------------|
 | `compute_media_content_hash` | `(data: bytes, virtual: bool = False) -> str` | Returns hex XxHash64 digest of sampled content. If `virtual=True`, prefixes with "V". |
+| `compute_media_content_hash_from_file` | `(path: str, virtual: bool = False) -> str` | Same algorithm but reads sampling regions directly from a file on disk — only 3 KB I/O regardless of file size. Produces identical hashes to the bytes-based version. (AZ-178) |
 
 ## Internal Logic
 
@@ -27,7 +28,7 @@ The sampling avoids reading the full file through the hash function while still
 
 ## Consumers
 
-- `main` — computes content hash for uploaded media in `POST /detect` to use as the media record ID and storage filename
+- `main` — computes content hash for uploaded media in `POST /detect` (bytes version) and `POST /detect/video` (file version) to use as the media record ID and storage filename
 
 ## Data Models
 
@@ -48,3 +49,4 @@ None. The hash is non-cryptographic (fast, not tamper-resistant).
 ## Tests
 
 - `tests/test_media_hash.py` — covers small files, large files, and virtual prefix behavior
+- `tests/test_az178_streaming_video.py::TestMediaContentHashFromFile` — verifies file-based hash matches bytes-based hash for small, large, boundary, and virtual cases
diff --git a/_docs/02_document/modules/streaming_buffer.md b/_docs/02_document/modules/streaming_buffer.md
new file mode 100644
index 0000000..8e0d5d9
--- /dev/null
+++ b/_docs/02_document/modules/streaming_buffer.md
@@ -0,0 +1,82 @@
+# Module: streaming_buffer
+
+## Purpose
+
+File-like object backed by a temp file that supports concurrent append (write) and read+seek (read) from separate threads. Designed for true streaming video detection: the HTTP handler appends incoming chunks while the inference thread reads and decodes frames via PyAV — simultaneously, without buffering the entire file in memory.
+
+## Public Interface
+
+### Class: StreamingBuffer
+
+| Method | Signature | Description |
+|--------|-----------|-------------|
+| `__init__` | `(temp_dir: str \| None = None)` | Creates a temp file in `temp_dir`; opens separate write and read handles |
+| `append` | `(data: bytes) -> None` | Writes data to temp file, flushes, notifies waiting readers |
+| `close_writer` | `() -> None` | Signals EOF — wakes all blocked readers |
+| `read` | `(size: int = -1) -> bytes` | Reads up to `size` bytes; blocks if data not yet available; returns `b""` on EOF |
+| `seek` | `(offset: int, whence: int = 0) -> int` | Seeks reader position; SEEK_END blocks until EOF is signaled |
+| `tell` | `() -> int` | Returns current reader position |
+| `readable` | `() -> bool` | Always returns `True` |
+| `seekable` | `() -> bool` | Always returns `True` |
+| `writable` | `() -> bool` | Always returns `False` |
+| `close` | `() -> None` | Closes both file handles |
+
+### Properties
+
+| Property | Type | Description |
+|----------|------|-------------|
+| `path` | `str` | Absolute path to the backing temp file |
+| `written` | `int` | Total bytes appended so far |
+
+## Internal Logic
+
+### Thread Coordination
+
+Uses `threading.Condition` to synchronize one writer (HTTP handler) and one reader (PyAV/inference thread):
+
+- **append()**: acquires lock → writes to file → flushes → increments `_written` → `notify_all()` → releases lock
+- **read(size)**: acquires lock → checks if data available → if not and not EOF, calls `wait()` (releases lock, sleeps) → woken by `notify_all()` → calculates bytes to read → releases lock → reads from file (outside lock)
+- **seek(0, 2)** (SEEK_END): acquires lock → if EOF not signaled, calls `wait()` in loop → once EOF, delegates to `_reader.seek(offset, 2)`
+
+The file read itself happens **outside** the lock to avoid holding the lock during I/O.
+
+### File Handle Separation
+
+Two independent file descriptors on the same temp file:
+- `_writer` opened with `"wb"` — append-only, used by the HTTP handler
+- `_reader` opened with `"rb"` — seekable, used by PyAV
+
+On POSIX systems, writes flushed by one fd are immediately visible to reads on another fd of the same inode (shared kernel page cache). `os.rename()` on the path while the reader fd is open is safe — the fd retains access to the underlying inode.
+
+### SEEK_END Behavior
+
+When PyAV tries to seek to the end of the file (e.g. to find MP4 moov atom), `seek(0, 2)` blocks until `close_writer()` is called. This provides graceful degradation for non-faststart MP4 files: the decoder waits for the full upload, then processes normally. For faststart MP4/MKV/WebM, SEEK_END is never called and frames are decoded immediately.
+
+## Dependencies
+
+- **External**: `os`, `tempfile`, `threading`
+- **Internal**: none (leaf module)
+
+## Consumers
+
+- `main` — creates `StreamingBuffer` in `POST /detect/video`, feeds chunks via `append()`, passes buffer to inference
+
+## Data Models
+
+None.
+
+## Configuration
+
+None.
+
+## External Integrations
+
+None.
+
+## Security
+
+None. Temp file permissions follow OS defaults (`tempfile.mkstemp`).
+
+## Tests
+
+- `tests/test_az178_streaming_video.py::TestStreamingBuffer` — sequential write/read, blocking read, EOF, concurrent chunked read/write, seek set, seek end blocking, tell, file persistence, written property, seekable/readable flags
diff --git a/_docs/02_document/system-flows.md b/_docs/02_document/system-flows.md
index 930f5e3..0ed0704 100644
--- a/_docs/02_document/system-flows.md
+++ b/_docs/02_document/system-flows.md
@@ -10,6 +10,7 @@
 | F4 | SSE Event Streaming | Client GET /detect/stream | API | Medium |
 | F5 | Engine Initialization | First detection request | Inference Pipeline, Engines, Loader | High |
 | F6 | TensorRT Background Conversion | No pre-built TensorRT engine | Inference Pipeline, Engines, Loader | Medium |
+| F7 | Streaming Video Detection | Client POST /detect/video | API, StreamingBuffer, Inference Pipeline, Engines, Domain, Annotations | High |
 
 ## Flow Dependencies
 
@@ -18,9 +19,10 @@
 | F1 | F5 (for meaningful status) | — |
 | F2 | F5 (engine must be ready) | Annotations (media lifecycle) |
 | F3 | F5 (engine must be ready) | F4 (via SSE event queues), Annotations (settings, media lifecycle) |
-| F4 | — | F3 (receives events) |
+| F4 | — | F3, F7 (receives events) |
 | F5 | — | F6 (triggers conversion if needed) |
 | F6 | F5 (triggered by init failure) | F5 (provides converted bytes) |
+| F7 | F5 (engine must be ready) | F4 (via SSE event queues), Annotations (media lifecycle) |
 
 ---
 
@@ -317,3 +319,255 @@ sequenceDiagram
     INF->>STATUS: set_status(ENABLED)
     Note over INF: Next init_ai() call will load from _converted_model_bytes
 ```
+
+---
+
+## Flow F7: Streaming Video Detection (AZ-178)
+
+### Description
+
+Client uploads a video file as raw binary and gets near-real-time detections via SSE as frames are decoded — **during** the upload, not after. The endpoint bypasses FastAPI's multipart buffering entirely, using `request.stream()` to read the HTTP body chunk-by-chunk. Each chunk is simultaneously written to a temp file (via `StreamingBuffer`) and read by PyAV in a background inference thread. First detections appear within ~500ms of the first decodable frames arriving at the API. Peak memory usage is bounded by the model batch size × frame size (tens of MB), regardless of video file size.
+
+### Activity Diagram — Full Data Pipeline
+
+```mermaid
+flowchart TD
+    subgraph CLIENT ["Client (Browser)"]
+        C1([Open SSE connection<br/>GET /detect/stream])
+        C2([Start upload<br/>POST /detect/video])
+        C3([Receive SSE events<br/>during upload])
+    end
+
+    subgraph API ["API Layer — main.py (async event loop)"]
+        A1[Parse headers:<br/>X-Filename, X-Config, Auth]
+        A2{Valid video<br/>extension?}
+        A3[Create StreamingBuffer<br/>backed by temp file]
+        A4[Start inference thread<br/>via run_in_executor]
+        A5["Read chunk from<br/>request.stream()"]
+        A6[buffer.append chunk<br/>via run_in_executor]
+        A7{More chunks?}
+        A8[buffer.close_writer<br/>signal EOF]
+        A9[Compute content hash<br/>from temp file on disk<br/>reads only 3 KB]
+        A10[Rename temp file →<br/>permanent storage path]
+        A11[Create media record<br/>POST /api/media]
+        A12["Return {status: started,<br/>mediaId: hash}"]
+        A13[Register background task<br/>to await inference completion]
+    end
+
+    subgraph BUF ["StreamingBuffer — streaming_buffer.py"]
+        B1[/"Temp file on disk<br/>(single file, two handles)"/]
+        B2["append(data):<br/>write + flush + notify"]
+        B3["read(size):<br/>block if ahead of writer<br/>return available bytes"]
+        B4["seek(offset, whence):<br/>SEEK_END blocks until EOF"]
+        B5["close_writer():<br/>set EOF flag, notify all"]
+    end
+
+    subgraph INF ["Inference Thread — inference.pyx"]
+        I1["av.open(buffer)<br/>PyAV reads via buffer.read()"]
+        I2{Moov at start?}
+        I3[Decode frames immediately<br/>~500ms latency]
+        I4["Blocks on seek(0, 2)<br/>until upload completes"]
+        I5["Decode batch of frames<br/>(frame_period_recognition sampling)"]
+        I6["engine.process_frames(batch)"]
+        I7{Detections found?}
+        I8["on_annotation callback<br/>→ SSE event broadcast"]
+        I9{More frames?}
+        I10[send_detection_status]
+    end
+
+    C2 --> A1
+    A1 --> A2
+    A2 -->|No| ERR([400 Bad Request])
+    A2 -->|Yes| A3
+    A3 --> A4
+    A4 --> A5
+
+    A5 --> A6
+    A6 --> B2
+    B2 --> B1
+    A6 --> A7
+    A7 -->|Yes| A5
+    A7 -->|No| A8
+    A8 --> B5
+
+    A8 --> A9
+    A9 --> A10
+    A10 --> A11
+    A11 --> A12
+    A12 --> A13
+
+    A4 -.->|background thread| I1
+    I1 --> I2
+    I2 -->|"Yes (faststart MP4,<br/>MKV, WebM)"| I3
+    I2 -->|"No (standard MP4)"| I4
+    I4 --> I3
+    I3 --> I5
+    I5 --> I6
+    I6 --> I7
+    I7 -->|Yes| I8
+    I8 --> C3
+    I7 -->|No| I9
+    I8 --> I9
+    I9 -->|Yes| I5
+    I9 -->|No| I10
+
+    B3 -.->|"PyAV calls<br/>read()"| I1
+
+    style BUF fill:#e8f4fd,stroke:#2196F3
+    style INF fill:#fce4ec,stroke:#e91e63
+    style API fill:#e8f5e9,stroke:#4CAF50
+    style CLIENT fill:#fff3e0,stroke:#FF9800
+```
+
+### Sequence Diagram — Concurrent Timeline
+
+```mermaid
+sequenceDiagram
+    participant Client
+    participant SSE as SSE /detect/stream
+    participant API as main.py (async)
+    participant BUF as StreamingBuffer
+    participant INF as Inference Thread
+    participant PyAV
+    participant ENG as Engine (ONNX/TRT)
+    participant ANN as Annotations Service
+
+    Client->>SSE: GET /detect/stream (open)
+    Client->>API: POST /detect/video (raw body, streaming)
+    API->>API: Parse X-Filename, X-Config, Auth headers
+    API->>BUF: Create StreamingBuffer (temp file)
+    API->>INF: Start in executor thread
+
+    par Upload stream (async event loop) and Inference (background thread)
+        loop Each HTTP body chunk (~8-64 KB)
+            API->>BUF: append(chunk) → write + flush + notify
+        end
+
+        INF->>PyAV: av.open(buffer)
+        Note over PyAV,BUF: PyAV calls buffer.read().<br/>Blocks when no data yet.<br/>Resumes as chunks arrive.
+
+        loop Each decodable frame batch
+            PyAV->>BUF: read(size) → returns available bytes
+            BUF-->>PyAV: video data
+            PyAV-->>INF: decoded frames (BGR numpy)
+            INF->>ENG: process_frames(batch)
+            ENG-->>INF: detections
+            opt Valid detections
+                INF->>SSE: DetectionEvent (via callback)
+                SSE-->>Client: data: {...detections...}
+            end
+        end
+    end
+
+    API->>BUF: close_writer() → EOF signal
+    Note over INF: PyAV reads remaining frames, finishes
+
+    API->>API: compute_media_content_hash_from_file(temp file) — reads 3 KB
+    API->>API: Rename temp file → {hash}{ext}
+
+    opt Authenticated user
+        API->>ANN: POST /api/media (create record)
+        API->>ANN: PUT /api/media/{id}/status (AI_PROCESSING)
+    end
+
+    API-->>Client: {"status": "started", "mediaId": "abc123"}
+
+    Note over API: Background task awaits inference completion
+
+    INF-->>API: Inference completes
+    opt Authenticated user
+        API->>ANN: PUT /api/media/{id}/status (AI_PROCESSED)
+    end
+    API->>SSE: DetectionEvent(status=AIProcessed, percent=100)
+    SSE-->>Client: data: {...status: AIProcessed...}
+```
+
+### Flowchart — StreamingBuffer Read/Write Coordination
+
+```mermaid
+flowchart TD
+    subgraph WRITER ["Writer (HTTP handler thread)"]
+        W1["Receive HTTP chunk"]
+        W2["Acquire Condition lock"]
+        W3["file.write(chunk) + flush()"]
+        W4["_written += len(chunk)"]
+        W5["notify_all() → wake reader"]
+        W6["Release lock"]
+        W7{More chunks?}
+        W8["close_writer():<br/>set _eof = True<br/>notify_all()"]
+    end
+
+    subgraph READER ["Reader (PyAV / Inference thread)"]
+        R1["PyAV calls read(size)"]
+        R2["Acquire Condition lock"]
+        R3{"_written > pos?"}
+        R4["cond.wait()<br/>(releases lock, sleeps)"]
+        R5["Calculate to_read =<br/>min(size, available)"]
+        R6["Release lock"]
+        R7["file.read(to_read)<br/>(outside lock)"]
+        R8["Return bytes to PyAV"]
+        R9{"_eof and<br/>available == 0?"}
+        R10["Return b'' (EOF)"]
+    end
+
+    W1 --> W2 --> W3 --> W4 --> W5 --> W6 --> W7
+    W7 -->|Yes| W1
+    W7 -->|No| W8
+
+    R1 --> R2 --> R3
+    R3 -->|Yes| R5
+    R3 -->|No| R9
+    R9 -->|Yes| R10
+    R9 -->|No| R4
+    R4 -.->|"Woken by<br/>notify_all()"| R3
+    R5 --> R6 --> R7 --> R8
+
+    style WRITER fill:#e8f5e9,stroke:#4CAF50
+    style READER fill:#fce4ec,stroke:#e91e63
+```
+
+### Data Flow
+
+| Step | From | To | Data | Format |
+|------|------|----|------|--------|
+| 1 | Client | API | Raw video bytes (streaming) | HTTP POST body chunks |
+| 2 | API | StreamingBuffer | Byte chunks (8-64 KB each) | `append(bytes)` |
+| 3 | StreamingBuffer | Temp file | Same chunks | `file.write()` + `flush()` |
+| 4 | StreamingBuffer | PyAV (Inference thread) | Byte segments on demand | `read(size)` blocks when ahead |
+| 5 | PyAV | Inference | Decoded BGR numpy frames | ndarray |
+| 6 | Inference | Engine | Preprocessed batch | ndarray |
+| 7 | Engine | Inference | Raw detections | ndarray |
+| 8 | Inference | SSE clients | DetectionEvent | SSE JSON via `loop.call_soon_threadsafe` |
+| 9 | API | Temp file | Content hash (3 KB read) | `compute_media_content_hash_from_file` |
+| 10 | API | Disk | Rename temp → permanent path | `os.rename` |
+| 11 | API | Annotations Service | Media record + status | HTTP POST/PUT JSON |
+
+### Memory Profile (2 GB video)
+
+| Stage | Current (F2) | Streaming (F7) |
+|-------|-------------|----------------|
+| Starlette buffering | 2 GB (SpooledTempFile) | 0 (raw stream) |
+| `file.read()` / chunk buffer | 2 GB (full bytes) | ~64 KB (one chunk) |
+| BytesIO for PyAV | 2 GB (copy) | 0 (reads from buffer) |
+| Writer thread | 2 GB (same ref) | 0 (no separate writer) |
+| **Peak process RAM** | **~4+ GB** | **~50 MB** (batch × frame) |
+
+### Format Compatibility
+
+| Container Format | Moov Location | Streaming Behavior |
+|-----------------|--------------|-------------------|
+| MP4 (faststart) | Beginning | True streaming — first frame decoded in ~500ms |
+| MKV / WebM | Beginning | True streaming — first frame decoded in ~500ms |
+| MP4 (standard) | End of file | Graceful degradation — `seek(0, 2)` blocks until upload completes, then decoding starts |
+| MOV, AVI | Varies | Depends on header location |
+
+### Error Scenarios
+
+| Error | Where | Detection | Recovery |
+|-------|-------|-----------|----------|
+| Non-video extension | API | Extension check | 400 Bad Request |
+| Client disconnects mid-upload | request.stream() | Exception | buffer.close_writer() called in except, inference thread gets EOF |
+| Engine unavailable | Inference thread | engine is None | Error event via SSE |
+| PyAV decode failure | Inference thread | Exception | Error event via SSE, media status set to Error |
+| Disk full | StreamingBuffer.append | OSError | Propagated to API handler |
+| Annotations service down | _post_media_record | Exception caught | Silently continues, detections still work |
diff --git a/_docs/02_tasks/todo/AZ-177_remove_redundant_video_prewrite.md b/_docs/02_tasks/done/AZ-177_remove_redundant_video_prewrite.md
similarity index 100%
rename from _docs/02_tasks/todo/AZ-177_remove_redundant_video_prewrite.md
rename to _docs/02_tasks/done/AZ-177_remove_redundant_video_prewrite.md
diff --git a/_docs/02_tasks/todo/AZ-178_true_streaming_video_detect.md b/_docs/02_tasks/todo/AZ-178_true_streaming_video_detect.md
new file mode 100644
index 0000000..f26df5c
--- /dev/null
+++ b/_docs/02_tasks/todo/AZ-178_true_streaming_video_detect.md
@@ -0,0 +1,107 @@
+# True Streaming Video Detection
+
+**Task**: AZ-178_true_streaming_video_detect
+**Name**: Start inference as upload bytes arrive — no buffering
+**Description**: Replace the fully-buffered `/detect` upload flow with a true streaming pipeline where video bytes flow simultaneously to disk and to PyAV for frame decoding + inference. First detection must appear within ~500ms of first decodable frames arriving at the API.
+**Complexity**: 5 points
+**Dependencies**: AZ-173 (stream-based run_detect)
+**Component**: Main, Inference, MediaHash
+**Jira**: AZ-178
+**Parent**: AZ-172
+
+## Problem
+
+The current `/detect` endpoint has three sequential blocking stages before any detection runs:
+
+1. **Starlette multipart buffering**: `UploadFile = File(...)` causes Starlette to consume the entire HTTP body and spool it to a `SpooledTemporaryFile` before the handler is called. For 2 GB → user waits for full upload.
+2. **Full RAM load**: `await file.read()` copies the entire spooled file into a `bytes` object in RAM. For 2 GB → ~2 GB+ allocated.
+3. **BytesIO + writer thread**: `run_detect_video(video_bytes, ...)` wraps `bytes` in `io.BytesIO` for PyAV and spawns a separate thread to write the same bytes to disk. For 2 GB → ~4 GB RAM total + double disk write.
+
+Net result: zero detection output until the entire file is uploaded AND loaded into RAM.
+
+## Target State
+
+```
+HTTP chunks ──┬──▸ StreamingBuffer (temp file) ──▸ PyAV decode ──▸ inference ──▸ SSE
+              └──▸ (same temp file serves as permanent storage after rename)
+```
+
+- Bytes flow chunk-by-chunk from the network into a `StreamingBuffer`
+- PyAV reads from the same buffer concurrently — blocks when ahead of the writer, resumes as new data arrives
+- No intermediate `bytes` object holds the full file in RAM
+- Peak memory: ~model batch size × frame size (tens of MB), not file size
+
+## Technical Design
+
+### 1. StreamingBuffer (`src/streaming_buffer.py`)
+
+A file-like object backed by a temp file with concurrent append + read:
+
+- `append(data)` — called from the async HTTP handler (via executor); writes to temp file, flushes, notifies readers
+- `read(size)` — called by PyAV; blocks via `Condition.wait()` when data not yet available
+- `seek(offset, whence)` — supports SEEK_SET/SEEK_CUR normally; SEEK_END blocks until writer signals EOF (graceful degradation for non-faststart MP4)
+- `tell()`, `seekable()`, `readable()` — standard file protocol
+- `close_writer()` — signals EOF
+- Thread-safe via `threading.Condition`
+
+**Format compatibility:**
+- Faststart MP4, MKV, WebM → true streaming (moov/header at start)
+- Standard MP4 (moov at end) → SEEK_END blocks until upload completes, then decoding starts (correct, just not streaming)
+
+### 2. `run_detect_video_stream` in `inference.pyx`
+
+New method accepting a file-like `readable` instead of `bytes`:
+
+```python
+cpdef run_detect_video_stream(self, object readable, AIRecognitionConfig ai_config,
+                               str media_name, object annotation_callback,
+                               object status_callback=None)
+```
+
+- Opens `av.open(readable)` directly — PyAV calls `read()`/`seek()` on the StreamingBuffer
+- Reuses existing `_process_video_pyav` for frame decode → batch inference
+- No writer thread needed (StreamingBuffer already persists to disk)
+
+### 3. `compute_media_content_hash_from_file` in `media_hash.py`
+
+File-based variant of `compute_media_content_hash` that reads only 3 sampling regions (3 KB) from disk instead of loading the entire file:
+
+```python
+def compute_media_content_hash_from_file(path: str) -> str
+```
+
+Produces identical hashes to the existing `compute_media_content_hash(data)`.
+
+### 4. `POST /detect/video` endpoint in `main.py`
+
+New endpoint — raw binary body (not multipart), bypassing Starlette's buffering:
+
+- Filename via `X-Filename` header, config via `X-Config` header
+- Auth via `Authorization` / `X-Refresh-Token` headers (same as existing)
+- Uses `request.stream()` for async chunk iteration
+- Creates `StreamingBuffer`, starts inference in executor thread
+- Feeds chunks to buffer via `run_in_executor` (non-blocking event loop)
+- After upload completes: compute hash from file, rename to permanent path, create media record
+- Returns `{"status": "started", "mediaId": "<hash>"}` — inference continues in background
+- Detections flow via existing SSE `/detect/stream`
+
+## Acceptance Criteria
+
+- [ ] Video detection starts as soon as first frames are decodable (~500ms for faststart formats)
+- [ ] 2 GB video never loads entirely into RAM (peak memory < 100 MB for the streaming pipeline)
+- [ ] Video bytes written to disk exactly once (no double-write)
+- [ ] Standard MP4 (moov at end) still works correctly (graceful degradation)
+- [ ] Detections delivered via SSE in real-time during upload
+- [ ] Content hash identical to existing `compute_media_content_hash`
+- [ ] All existing tests pass
+- [ ] Existing `/detect` endpoint unchanged (images and legacy callers unaffected)
+
+## File Changes
+
+| File | Action | Description |
+|------|--------|-------------|
+| `src/streaming_buffer.py` | New | StreamingBuffer class |
+| `src/inference.pyx` | Modified | Add `run_detect_video_stream` method |
+| `src/media_hash.py` | Modified | Add `compute_media_content_hash_from_file` |
+| `src/main.py` | Modified | Add `POST /detect/video` endpoint |
+| `tests/test_streaming_buffer.py` | New | Unit tests for StreamingBuffer |
diff --git a/_docs/03_implementation/implementation_report_streaming_video.md b/_docs/03_implementation/implementation_report_streaming_video.md
new file mode 100644
index 0000000..98084ab
--- /dev/null
+++ b/_docs/03_implementation/implementation_report_streaming_video.md
@@ -0,0 +1,74 @@
+# Implementation Report — True Streaming Video Detection
+
+**Date**: 2026-04-01
+**Task**: AZ-178
+**Complexity**: 5 points
+**Parent**: AZ-172
+
+## Summary
+
+Implemented a true streaming video detection pipeline. The new `POST /detect/video` endpoint bypasses Starlette's multipart buffering entirely — bytes flow directly from the HTTP body to PyAV frame decoding and inference via a `StreamingBuffer`, simultaneously persisting to disk. For faststart MP4/MKV/WebM, first detections appear within ~500ms of first frames arriving. Peak memory is bounded by model batch size, not file size.
+
+## Problem
+
+The existing `POST /detect` endpoint had three sequential blocking stages for a 2 GB video:
+1. Starlette `UploadFile` spools entire body to temp file (~2 GB on disk, client waits)
+2. `await file.read()` loads entire file into RAM (~2 GB)
+3. `run_detect_video` wraps bytes in `BytesIO` for PyAV + spawns writer thread (~4 GB peak RAM, double disk write)
+
+Zero detection output until full upload + full RAM load completed.
+
+## Solution
+
+| Component | What it does |
+|-----------|-------------|
+| `StreamingBuffer` | File-like object backed by temp file. Writer appends chunks, reader blocks until data arrives. Thread-safe via `Condition`. |
+| `run_detect_video_stream` | New inference method — `av.open(readable)` on the buffer. Reuses `_process_video_pyav`. No writer thread needed. |
+| `compute_media_content_hash_from_file` | Reads only 3 KB sampling regions from disk (identical hashes to bytes-based version). |
+| `POST /detect/video` | Raw binary body via `request.stream()`. Starts inference immediately, feeds chunks to buffer, detections stream via SSE. |
+
+## File Changes
+
+| File | Action | Lines | Description |
+|------|--------|-------|-------------|
+| `src/streaming_buffer.py` | New | 84 | StreamingBuffer class |
+| `src/inference.pyx` | Modified | +19 | `run_detect_video_stream` method |
+| `src/media_hash.py` | Modified | +17 | `compute_media_content_hash_from_file` function |
+| `src/main.py` | Modified | +130 | `POST /detect/video` endpoint |
+| `tests/test_az178_streaming_video.py` | New | ~200 | 14 unit tests (StreamingBuffer, hash, endpoint) |
+| `e2e/tests/test_streaming_video_upload.py` | New | ~250 | 2 e2e tests (faststart streaming, non-faststart fallback) |
+
+## Documentation Updated
+
+| File | Changes |
+|------|---------|
+| `_docs/02_document/system-flows.md` | Added Flow F7 with activity diagram, sequence diagram, buffer coordination flowchart, memory profile, format compatibility table |
+| `_docs/02_document/modules/main.md` | Added `/detect/video` endpoint docs |
+| `_docs/02_document/modules/inference.md` | Added `run_detect_video_stream` method docs |
+| `_docs/02_document/modules/media_hash.md` | Added `compute_media_content_hash_from_file` docs |
+| `_docs/02_document/modules/streaming_buffer.md` | New module documentation |
+| `_docs/02_document/components/04_api/description.md` | Added endpoint spec, dependency graph |
+| `_docs/02_document/components/03_inference_pipeline/description.md` | Added streaming video processing section |
+
+## Memory Profile (2 GB video)
+
+| Stage | Before (POST /detect) | After (POST /detect/video) |
+|-------|----------------------|---------------------------|
+| HTTP buffering | 2 GB (SpooledTempFile) | 0 (raw stream) |
+| File → RAM | 2 GB (file.read()) | ~64 KB (one chunk) |
+| PyAV input | 2 GB (BytesIO copy) | 0 (reads from buffer) |
+| Writer thread | 2 GB (same ref) | 0 (not needed) |
+| **Peak process RAM** | **~4+ GB** | **~50 MB** (batch x frame) |
+
+## Format Behavior
+
+| Format | Behavior |
+|--------|----------|
+| MP4 (faststart) | True streaming — ~500ms to first detection |
+| MKV / WebM | True streaming — ~500ms to first detection |
+| MP4 (moov at end) | Graceful degradation — blocks until upload completes, then decodes |
+
+## Test Results
+
+36/36 unit tests passed (18 existing + 18 new).
+2 e2e tests added (require deployed service + video fixtures).
diff --git a/_docs/03_implementation/implementation_report_video_prewrite.md b/_docs/03_implementation/implementation_report_video_prewrite.md
new file mode 100644
index 0000000..dfefff4
--- /dev/null
+++ b/_docs/03_implementation/implementation_report_video_prewrite.md
@@ -0,0 +1,31 @@
+# Implementation Report — Video Pre-Write Removal
+
+**Date**: 2026-04-01
+**Tasks**: 1
+**Batches**: 1
+**Total Complexity**: 2 points
+
+## Summary
+
+Removed redundant synchronous video pre-writes in the `/detect` endpoint. Videos are now written to disk exactly once by `inference.pyx`'s background writer thread, concurrently with frame detection.
+
+## Tasks Completed
+
+| Task | Name | Complexity | Status |
+|------|------|-----------|--------|
+| AZ-177 | remove_redundant_video_prewrite | 2 | Done |
+
+## Changes
+
+| File | Change |
+|------|--------|
+| src/main.py | Auth'd path: wrap file write with `if kind == "image"` so videos skip pre-write |
+| src/main.py | Non-auth'd path: remove `open().write()` after `mkstemp` — inference handles the write |
+
+## Test Results
+
+18/18 tests passed. 4 new tests added:
+- `test_auth_video_storage_path_opened_wb_once`
+- `test_non_auth_temp_video_opened_wb_once_and_removed`
+- `test_auth_image_still_writes_once_before_detect`
+- `test_video_writer_runs_in_separate_thread_from_executor`
diff --git a/_docs/04_deploy/ci_cd_pipeline.md b/_docs/04_deploy/ci_cd_pipeline.md
new file mode 100644
index 0000000..e963e51
--- /dev/null
+++ b/_docs/04_deploy/ci_cd_pipeline.md
@@ -0,0 +1,148 @@
+# CI/CD Pipeline
+
+## Platform
+
+GitHub Actions (default recommendation; adaptable to Azure Pipelines).
+
+## Pipeline Stages
+
+| Stage | Trigger | Steps | Quality Gate |
+|-------|---------|-------|-------------|
+| Lint | Every push | `black --check`, Cython syntax check | Zero errors |
+| Unit Test | Every push | `pytest tests/ -v --csv=report.csv` | All pass |
+| Security Scan | Every push | `pip-audit`, Trivy image scan | Zero critical/high CVEs |
+| Build | PR merge to dev | Build `detections-cpu` and `detections-gpu` images, tag with git SHA | Build succeeds |
+| E2E Test | After build | `docker compose -f e2e/docker-compose.test.yml up --abort-on-container-exit` | All e2e tests pass |
+| Push | After e2e | Push images to container registry | Push succeeds |
+| Deploy Staging | After push | Deploy to staging via `scripts/deploy.sh` | Health check passes |
+| Deploy Production | Manual approval | Deploy to production via `scripts/deploy.sh` | Health check passes |
+
+## Workflow Definition (GitHub Actions)
+
+```yaml
+name: CI/CD
+
+on:
+  push:
+    branches: [dev, main]
+  pull_request:
+    branches: [dev]
+
+env:
+  REGISTRY: ${{ secrets.REGISTRY }}
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - run: pip install black
+      - run: black --check src/ tests/
+
+  test:
+    runs-on: ubuntu-latest
+    needs: lint
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - run: pip install -r requirements.txt
+      - run: python setup.py build_ext --inplace
+      - run: cd src && pytest ../tests/ -v
+
+  security:
+    runs-on: ubuntu-latest
+    needs: lint
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - run: pip install pip-audit
+      - run: pip-audit -r requirements.txt
+
+  build:
+    runs-on: ubuntu-latest
+    needs: [test, security]
+    if: github.event_name == 'push'
+    steps:
+      - uses: actions/checkout@v4
+      - uses: docker/setup-buildx-action@v3
+      - uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.REGISTRY_USER }}
+          password: ${{ secrets.REGISTRY_PASSWORD }}
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.REGISTRY }}/azaion/detections-cpu:${{ github.sha }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+  e2e:
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - uses: actions/checkout@v4
+      - run: |
+          cd e2e
+          COMPOSE_PROFILES=cpu docker compose -f docker-compose.test.yml up \
+            --build --abort-on-container-exit --exit-code-from e2e-runner
+
+  deploy-staging:
+    runs-on: ubuntu-latest
+    needs: e2e
+    if: github.ref == 'refs/heads/dev'
+    environment: staging
+    steps:
+      - uses: actions/checkout@v4
+      - run: |
+          IMAGE_TAG=${{ github.sha }} \
+          DEPLOY_HOST=${{ secrets.STAGING_HOST }} \
+          DEPLOY_USER=${{ secrets.STAGING_USER }} \
+          bash scripts/deploy.sh
+
+  deploy-production:
+    runs-on: ubuntu-latest
+    needs: e2e
+    if: github.ref == 'refs/heads/main'
+    environment:
+      name: production
+      url: ${{ secrets.PRODUCTION_URL }}
+    steps:
+      - uses: actions/checkout@v4
+      - run: |
+          IMAGE_TAG=${{ github.sha }} \
+          DEPLOY_HOST=${{ secrets.PRODUCTION_HOST }} \
+          DEPLOY_USER=${{ secrets.PRODUCTION_USER }} \
+          bash scripts/deploy.sh
+```
+
+## Caching Strategy
+
+| Cache Type | Scope | Tool |
+|-----------|-------|------|
+| Python dependencies | Per requirements.txt hash | actions/cache + pip cache dir |
+| Docker layers | Per Dockerfile hash | BuildKit GHA cache |
+| Cython compiled modules | Per src/ hash | actions/cache |
+
+## Parallelization
+
+- `test` and `security` jobs run in parallel after `lint`
+- `build` waits for both `test` and `security`
+- GPU image build can be added as a parallel job to CPU build
+
+## Notifications
+
+| Event | Channel |
+|-------|---------|
+| Build failure | GitHub PR status check (blocks merge) |
+| Security scan failure | GitHub PR status check + team notification |
+| Deployment success/failure | Deployment environment status |
diff --git a/_docs/04_deploy/containerization.md b/_docs/04_deploy/containerization.md
new file mode 100644
index 0000000..83d7951
--- /dev/null
+++ b/_docs/04_deploy/containerization.md
@@ -0,0 +1,125 @@
+# Containerization Plan
+
+## Image Variants
+
+### detections-cpu (Dockerfile)
+
+| Aspect | Specification |
+|--------|--------------|
+| Base image | `python:3.11-slim` (pinned digest recommended) |
+| Build stages | Single stage (Cython compile requires gcc at runtime for setup.py) |
+| Non-root user | `adduser --disabled-password --gecos '' appuser` + `USER appuser` |
+| Health check | `HEALTHCHECK --interval=30s --timeout=5s CMD curl -f http://localhost:8080/health \|\| exit 1` |
+| Exposed ports | 8080 |
+| Entrypoint | `uvicorn main:app --host 0.0.0.0 --port 8080` |
+
+**Changes needed to existing Dockerfile**:
+1. Add non-root user (security finding F7)
+2. Add HEALTHCHECK directive
+3. Pin `python:3.11-slim` to specific digest
+4. Add `curl` to apt-get install (for health check)
+
+### detections-gpu (Dockerfile.gpu)
+
+| Aspect | Specification |
+|--------|--------------|
+| Base image | `nvidia/cuda:12.2.0-runtime-ubuntu22.04` |
+| Build stages | Single stage |
+| Non-root user | `adduser --disabled-password --gecos '' appuser` + `USER appuser` |
+| Health check | `HEALTHCHECK --interval=30s --timeout=5s CMD curl -f http://localhost:8080/health \|\| exit 1` |
+| Exposed ports | 8080 |
+| Entrypoint | `uvicorn main:app --host 0.0.0.0 --port 8080` |
+| Runtime | Requires `--runtime=nvidia` or `nvidia` runtime in Docker |
+
+**Changes needed to existing Dockerfile.gpu**:
+1. Add non-root user
+2. Add HEALTHCHECK directive
+3. Add `curl` to apt-get install
+
+### .dockerignore
+
+```
+.git
+.gitignore
+_docs/
+_standalone/
+e2e/
+tests/
+*.md
+.env
+.env.*
+.cursor/
+.venv/
+venv/
+__pycache__/
+*.pyc
+build/
+dist/
+*.egg-info
+Logs/
+```
+
+## Docker Compose — Local Development
+
+`docker-compose.yml` (already partially exists as `e2e/docker-compose.mocks.yml`):
+
+```yaml
+name: detections-dev
+
+services:
+  mock-loader:
+    build: ./e2e/mocks/loader
+    ports:
+      - "18080:8080"
+    volumes:
+      - ./e2e/fixtures:/models
+    networks:
+      - dev-net
+
+  mock-annotations:
+    build: ./e2e/mocks/annotations
+    ports:
+      - "18081:8081"
+    networks:
+      - dev-net
+
+  detections:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "8080:8080"
+    depends_on:
+      - mock-loader
+      - mock-annotations
+    env_file: .env
+    environment:
+      LOADER_URL: http://mock-loader:8080
+      ANNOTATIONS_URL: http://mock-annotations:8081
+    volumes:
+      - ./e2e/fixtures/classes.json:/app/classes.json:ro
+      - detections-logs:/app/Logs
+    shm_size: 512m
+    networks:
+      - dev-net
+
+volumes:
+  detections-logs:
+
+networks:
+  dev-net:
+    driver: bridge
+```
+
+## Docker Compose — Blackbox Tests
+
+Already exists: `e2e/docker-compose.test.yml`. No changes needed — supports both `cpu` and `gpu` profiles with mock services and test runner.
+
+## Image Tagging Strategy
+
+| Context | Tag Format | Example |
+|---------|------------|---------|
+| CI builds | `<registry>/azaion/detections-cpu:<git-sha>` | `registry.example.com/azaion/detections-cpu:a1b2c3d` |
+| CI builds (GPU) | `<registry>/azaion/detections-gpu:<git-sha>` | `registry.example.com/azaion/detections-gpu:a1b2c3d` |
+| Local development | `detections-cpu:dev` | — |
+| Latest stable | `<registry>/azaion/detections-cpu:latest` | Updated on merge to main |
diff --git a/_docs/04_deploy/deploy_scripts.md b/_docs/04_deploy/deploy_scripts.md
new file mode 100644
index 0000000..57374e4
--- /dev/null
+++ b/_docs/04_deploy/deploy_scripts.md
@@ -0,0 +1,65 @@
+# Deployment Scripts
+
+All scripts are in `scripts/` at the project root. Each script is POSIX-compatible (`bash` with `set -euo pipefail`), sources `.env` from the project root, and supports `--help`.
+
+## Script Reference
+
+| Script | Purpose | Key Env Vars |
+|--------|---------|-------------|
+| `deploy.sh` | Orchestrates full deployment | REGISTRY, IMAGE_TAG, DEPLOY_HOST, DEPLOY_USER |
+| `pull-images.sh` | Pulls Docker images from registry | REGISTRY, IMAGE_TAG |
+| `start-services.sh` | Starts the detections container | REGISTRY, IMAGE_TAG, LOADER_URL, ANNOTATIONS_URL |
+| `stop-services.sh` | Gracefully stops and removes container | — |
+| `health-check.sh` | Verifies `/health` endpoint | HEALTH_CHECK_HOST, HEALTH_CHECK_PORT |
+
+## Usage
+
+### Full Deployment
+
+```bash
+REGISTRY=registry.example.com IMAGE_TAG=a1b2c3d bash scripts/deploy.sh
+```
+
+### Remote Deployment (via SSH)
+
+```bash
+REGISTRY=registry.example.com \
+IMAGE_TAG=a1b2c3d \
+DEPLOY_HOST=production-server.example.com \
+DEPLOY_USER=deploy \
+bash scripts/deploy.sh
+```
+
+### Rollback
+
+```bash
+bash scripts/deploy.sh --rollback
+```
+
+Redeploys the previous image tag (saved in `.deploy-previous-tag` by `stop-services.sh`).
+
+### Local Development
+
+```bash
+docker compose -f docker-compose.yml up
+```
+
+Or using the e2e compose for testing:
+
+```bash
+cd e2e && COMPOSE_PROFILES=cpu docker compose -f docker-compose.test.yml up --build
+```
+
+## Deployment Flow
+
+```
+deploy.sh
+  ├── pull-images.sh    → docker pull
+  ├── stop-services.sh  → docker stop (30s grace) + save previous tag
+  ├── start-services.sh → docker run
+  └── health-check.sh   → curl /health (10 retries, 3s interval)
+```
+
+## Environment Variables
+
+All scripts source `.env` from the project root if it exists. Variables can also be passed directly via the environment. See `.env.example` for the full list.
diff --git a/_docs/04_deploy/deployment_procedures.md b/_docs/04_deploy/deployment_procedures.md
new file mode 100644
index 0000000..03d4ffc
--- /dev/null
+++ b/_docs/04_deploy/deployment_procedures.md
@@ -0,0 +1,70 @@
+# Deployment Procedures
+
+## Deployment Strategy
+
+**Pattern**: Rolling deployment (single-instance service; blue-green not applicable without load balancer).
+
+**Rationale**: The Detections service is a single-instance stateless service with in-memory state (`_active_detections`, `_event_queues`). Rolling replacement with graceful shutdown is the simplest approach.
+
+**Zero-downtime**: Achievable only with load-balanced multi-instance setup. For single-instance deployments, brief downtime (~10-30s) during container restart is expected.
+
+## Deployment Flow
+
+```
+1. Pull new images → pull-images.sh
+2. Stop current services gracefully → stop-services.sh (30s grace)
+3. Start new services → start-services.sh
+4. Verify health → health-check.sh
+5. If unhealthy → rollback (deploy.sh --rollback)
+```
+
+## Health Checks
+
+| Check | Type | Endpoint | Interval | Threshold |
+|-------|------|----------|----------|-----------|
+| Liveness | HTTP GET | `/health` | 30s | 3 failures → restart |
+| Readiness | HTTP GET | `/health` (check `aiAvailability` != "None") | 10s | Wait for engine init |
+
+**Note**: The service's `/health` endpoint returns `{"status": "healthy"}` immediately, with `aiAvailability` transitioning through DOWNLOADING → CONVERTING → ENABLED on first inference request (lazy init). Readiness depends on the use case — the API is ready to accept requests immediately, but inference is only ready after engine initialization.
+
+## Rollback Procedures
+
+**Trigger criteria**:
+- Health check fails after deployment (3 consecutive failures)
+- Error rate spike detected in monitoring
+- Critical bug reported by users
+
+**Rollback steps**:
+1. Run `deploy.sh --rollback` — redeploys the previous image tag
+2. Verify health via `health-check.sh`
+3. Notify stakeholders of rollback
+4. Investigate root cause
+
+**Previous image tag**: Saved by `stop-services.sh` to `.deploy-previous-tag` before each deployment.
+
+## Deployment Checklist
+
+Pre-deployment:
+- [ ] All CI pipeline stages pass (lint, test, security, e2e)
+- [ ] Security scan clean (zero critical/high CVEs)
+- [ ] Environment variables configured on target
+- [ ] `classes.json` available on target (or mounted volume)
+- [ ] ONNX model available in Loader service
+- [ ] Loader and Annotations services reachable from target
+
+Post-deployment:
+- [ ] `/health` returns 200
+- [ ] `aiAvailability` transitions to ENABLED after first request
+- [ ] SSE stream endpoint accessible
+- [ ] Detection request returns valid results
+- [ ] Logs show no errors
+
+## Graceful Shutdown
+
+**Current limitation**: No graceful shutdown implemented. In-progress detections are aborted on container stop. Background TensorRT conversion runs in a daemon thread (terminates with process).
+
+**Mitigation**: The `stop-services.sh` script sends SIGTERM with a 30-second grace period, allowing uvicorn to finish in-flight HTTP requests. Long-running video detections may be interrupted.
+
+## Database Migrations
+
+Not applicable — the Detections service has no database. All persistence is delegated to the Annotations service.
diff --git a/_docs/04_deploy/environment_strategy.md b/_docs/04_deploy/environment_strategy.md
new file mode 100644
index 0000000..3fd4c4d
--- /dev/null
+++ b/_docs/04_deploy/environment_strategy.md
@@ -0,0 +1,42 @@
+# Environment Strategy
+
+## Environments
+
+| Environment | Purpose | Infrastructure | Data |
+|-------------|---------|---------------|------|
+| Development | Local developer workflow | docker-compose with mock services | Mock Loader serves test ONNX model; mock Annotations accepts all requests |
+| Staging | Pre-production validation | Mirrors production topology (Docker or K8s) | Real Loader with test model; real Annotations with test database |
+| Production | Live system | Docker with GPU (TensorRT) + reverse proxy | Real Loader, real Annotations, production model |
+
+## Environment Variable Management
+
+| Source | Environment | Method |
+|--------|-------------|--------|
+| `.env` file | Development | Loaded by docker-compose; git-ignored |
+| `.env.example` | All | Template committed to VCS (no secrets) |
+| Secret manager | Staging/Production | Inject via deployment scripts or K8s secrets |
+
+All required variables are listed in `.env.example`. The application fails fast on missing `classes.json` (startup crash) but uses safe defaults for all other variables.
+
+## Secrets Management
+
+| Secret | Development | Staging | Production |
+|--------|-------------|---------|------------|
+| Container registry credentials | Local registry or none | CI/CD secret | CI/CD secret |
+| SSH deploy key | N/A | CI/CD secret | CI/CD secret |
+| Bearer tokens | Test tokens from mock | Real auth service | Real auth service |
+
+**Rotation policy**: Registry credentials and deploy keys should be rotated every 90 days. Bearer tokens are per-request (no stored credentials in the service).
+
+**No secrets stored by the service**: Detections is stateless — tokens come from client HTTP headers and are forwarded to the Annotations service. No database credentials, API keys, or encryption keys are needed.
+
+## Configuration Per Environment
+
+| Config | Development | Staging | Production |
+|--------|-------------|---------|------------|
+| LOADER_URL | http://mock-loader:8080 | http://loader:8080 | http://loader:8080 |
+| ANNOTATIONS_URL | http://mock-annotations:8081 | http://annotations:8080 | http://annotations:8080 |
+| GPU | Not required (ONNX CPU) | Optional | Required (TensorRT) |
+| Log level | DEBUG (stdout) | INFO (file + stdout) | INFO (file) |
+| TLS | None | Reverse proxy | Reverse proxy |
+| Rate limiting | None | Reverse proxy (optional) | Reverse proxy (required) |
diff --git a/_docs/04_deploy/observability.md b/_docs/04_deploy/observability.md
new file mode 100644
index 0000000..04880c3
--- /dev/null
+++ b/_docs/04_deploy/observability.md
@@ -0,0 +1,69 @@
+# Observability
+
+## Logging
+
+**Current state**: loguru with daily rotated files (`Logs/log_inference_YYYYMMDD.txt`, 30-day retention) + stdout/stderr. Format: `[HH:mm:ss LEVEL] message`.
+
+**Recommended improvements**:
+
+| Aspect | Current | Recommended |
+|--------|---------|-------------|
+| Format | Human-readable | Structured JSON to stdout (container-friendly) |
+| Fields | timestamp, level, message | + service, correlation_id, context |
+| PII | Not applicable | No user IDs or tokens in logs |
+| Retention | 30 days (file) | Console in dev; 7 days staging; 30 days production (via log aggregator) |
+
+**Container logging pattern**: Log to stdout/stderr only; let the container runtime (Docker/K8s) handle log collection and routing. Remove file-based logging in containerized deployments.
+
+## Metrics
+
+**Recommended `/metrics` endpoint** (Prometheus-compatible):
+
+| Metric | Type | Labels | Description |
+|--------|------|--------|-------------|
+| `detections_requests_total` | Counter | method, endpoint, status | Total HTTP requests |
+| `detections_request_duration_seconds` | Histogram | method, endpoint | Request processing time |
+| `detections_inference_duration_seconds` | Histogram | media_type (image/video) | Inference processing time |
+| `detections_active_inferences` | Gauge | — | Currently running inference jobs (0-2) |
+| `detections_sse_clients` | Gauge | — | Connected SSE clients |
+| `detections_engine_status` | Gauge | engine_type | 1=ready, 0=not ready |
+
+**Collection**: Prometheus scrape at 15s intervals.
+
+## Distributed Tracing
+
+**Limited applicability**: The Detections service makes outbound HTTP calls to Loader and Annotations. Trace context propagation is recommended for cross-service correlation.
+
+| Span | Parent | Description |
+|------|--------|-------------|
+| `detections.detect_image` | Client request | Full image detection flow |
+| `detections.detect_video` | Client request | Full video detection flow |
+| `detections.model_download` | detect_* | Model download from Loader |
+| `detections.post_annotation` | detect_* | Annotation POST to Annotations service |
+
+**Implementation**: OpenTelemetry Python SDK with OTLP exporter. Sampling: 100% in dev/staging, 10% in production.
+
+## Alerting
+
+| Severity | Response Time | Condition |
+|----------|---------------|-----------|
+| Critical | 5 min | Health endpoint returns non-200; container restart loop |
+| High | 30 min | Error rate > 5%; inference duration p95 > 10s |
+| Medium | 4 hours | SSE client count = 0 for extended period; disk > 80% |
+| Low | Next business day | Elevated log warnings; model download retries |
+
+## Dashboards
+
+**Operations dashboard**:
+- Service health status
+- Request rate by endpoint
+- Inference duration histogram (p50, p95, p99)
+- Active inference count (0-2 gauge)
+- SSE connected clients
+- Error rate by type
+
+**Inference dashboard**:
+- Detections per frame/video
+- Model availability status timeline
+- Engine type distribution (ONNX vs TensorRT)
+- Video batch processing rate
diff --git a/_docs/04_deploy/reports/deploy_status_report.md b/_docs/04_deploy/reports/deploy_status_report.md
new file mode 100644
index 0000000..7ba75b4
--- /dev/null
+++ b/_docs/04_deploy/reports/deploy_status_report.md
@@ -0,0 +1,56 @@
+# Deployment Status Report
+
+**Date**: 2026-03-31
+**Project**: Azaion.Detections
+
+## Component Readiness
+
+| Component | Status | Notes |
+|-----------|--------|-------|
+| Detections Service (CPU) | Implemented & Tested | Dockerfile exists, e2e tests pass |
+| Detections Service (GPU) | Implemented & Tested | Dockerfile.gpu exists, e2e tests pass |
+| E2E Test Suite | Implemented | docker-compose.test.yml exists |
+| Mock Services (Loader, Annotations) | Implemented | docker-compose.mocks.yml exists |
+
+## External Dependencies
+
+| Dependency | Type | Required | Notes |
+|------------|------|----------|-------|
+| Loader Service | REST API | Yes | Model download/upload |
+| Annotations Service | REST API | Yes | Result persistence, auth refresh |
+| classes.json | Config file | Yes | Mounted into container |
+| ONNX model (azaion.onnx) | AI model | Yes | Served by Loader |
+
+## Infrastructure Prerequisites
+
+| Prerequisite | Status | Action Required |
+|-------------|--------|-----------------|
+| Container registry | Needed | User must provide registry URL |
+| Docker runtime | Available | Both CPU and GPU Dockerfiles exist |
+| NVIDIA runtime | Needed for GPU | `nvidia-docker` required for Dockerfile.gpu |
+| Reverse proxy / TLS | Needed | No TLS at application level; requires external termination |
+| DNS / Service discovery | Needed | Services communicate by hostname |
+| Persistent storage | Not needed | Service is stateless |
+
+## Environment Variables
+
+| Variable | Default | Required | Purpose |
+|----------|---------|----------|---------|
+| LOADER_URL | http://loader:8080 | Yes | Loader service endpoint |
+| ANNOTATIONS_URL | http://annotations:8080 | Yes | Annotations service endpoint |
+| CLASSES_JSON_PATH | classes.json | Yes | Detection class definitions |
+| LOG_DIR | Logs | No | Log output directory |
+| VIDEOS_DIR | ./data/videos | No | Upload video storage |
+| IMAGES_DIR | ./data/images | No | Upload image storage |
+
+## Deployment Blockers
+
+| Blocker | Severity | Mitigation |
+|---------|----------|------------|
+| Security findings (FAIL verdict) | High | Fix Critical/High CVEs before production (see `_docs/05_security/security_report.md`) |
+| Containers run as root | Medium | Add non-root USER directive to Dockerfiles |
+| No CI/CD pipeline | Medium | Define and implement pipeline |
+
+## Recommendation
+
+The application is functionally ready for deployment. Security findings from the audit should be addressed before production deployment — at minimum, pin dependencies to fix CVE-2025-43859 and CVE-2026-28356.
diff --git a/_docs/05_security/dependency_scan.md b/_docs/05_security/dependency_scan.md
new file mode 100644
index 0000000..8f6c33d
--- /dev/null
+++ b/_docs/05_security/dependency_scan.md
@@ -0,0 +1,56 @@
+# Dependency Scan
+
+**Date**: 2026-03-31
+**Tool**: Manual review + WebSearch (pip-audit not installed)
+**Files scanned**: `requirements.txt`, `requirements-macos.txt`, `requirements-gpu.txt`, `e2e/requirements.txt`
+
+## Findings
+
+### CVE-2026-28356 — python-multipart ReDoS (HIGH, CVSS 7.5)
+
+- **Package**: python-multipart (unpinned in `requirements.txt`)
+- **Vulnerability**: `parse_options_header()` uses a regex with ambiguous alternation causing exponential backtracking on malicious HTTP/multipart headers
+- **Impact**: Denial of service against the FastAPI application
+- **Fixed in**: 1.2.2, 1.3.1, 1.4.0-dev
+- **Remediation**: Pin `python-multipart>=1.3.1`
+
+### CVE-2025-43859 — h11 HTTP Request Smuggling via uvicorn (CRITICAL, CVSS 9.1)
+
+- **Package**: uvicorn (unpinned in `requirements.txt`) → transitive dependency h11-0.14.0
+- **Vulnerability**: Lenient parsing of line terminators in chunked-coding message bodies enables HTTP request smuggling
+- **Impact**: Bypass security controls, cache poisoning, session hijacking, data leakage
+- **Fixed in**: h11 update required; check for h11>=0.15.0
+- **Remediation**: Pin `h11>=0.15.0` or verify uvicorn version pulls a patched h11
+
+### Unpinned Dependencies — Supply Chain Risk (MEDIUM)
+
+- **Packages**: `fastapi`, `uvicorn[standard]`, `python-multipart` in `requirements.txt`; `pytest`, `sseclient-py`, `flask`, `gunicorn` in `e2e/requirements.txt`
+- **Risk**: Unpinned packages may resolve to different (potentially vulnerable) versions across environments. A compromised PyPI upload could be silently pulled in.
+- **Remediation**: Pin all dependencies to specific versions
+
+### opencv-python — Outdated (LOW)
+
+- **Package**: opencv-python==4.10.0.84
+- **Vulnerability**: No known CVEs for 4.10.0.84, but latest is 4.13.0.92
+- **Remediation**: Consider upgrading to 4.13.0.92 when convenient
+
+## Summary
+
+| Severity | Count |
+|----------|-------|
+| Critical | 1     |
+| High     | 1     |
+| Medium   | 1     |
+| Low      | 1     |
+
+## No Known Vulnerabilities
+
+- requests==2.32.4 (patched version)
+- numpy==2.3.0
+- onnxruntime==1.22.0
+- loguru==0.7.3
+- av==14.2.0
+- xxhash==3.5.0
+- Cython==3.2.4
+- pynvml==12.0.0
+- coremltools==9.0
diff --git a/_docs/05_security/infrastructure_review.md b/_docs/05_security/infrastructure_review.md
new file mode 100644
index 0000000..1604560
--- /dev/null
+++ b/_docs/05_security/infrastructure_review.md
@@ -0,0 +1,79 @@
+# Configuration & Infrastructure Review
+
+**Date**: 2026-03-31
+**Scope**: Dockerfiles, docker-compose files, .env, .gitignore
+
+## Container Security
+
+### Dockerfile (CPU)
+
+| Check | Status | Detail |
+|-------|--------|--------|
+| Non-root user | FAIL | Runs as root (no USER directive) |
+| Minimal base image | PASS | Uses `python:3.11-slim` |
+| No secrets in build args | PASS | No ARG with secrets |
+| apt cache cleaned | PASS | `rm -rf /var/lib/apt/lists/*` |
+| No-cache pip install | PASS | `--no-cache-dir` |
+| Health check | FAIL | No HEALTHCHECK directive |
+
+### Dockerfile.gpu
+
+| Check | Status | Detail |
+|-------|--------|--------|
+| Non-root user | FAIL | Runs as root (no USER directive) |
+| Minimal base image | WARN | Uses `nvidia/cuda:12.2.0-runtime-ubuntu22.04` (necessary for GPU, but large) |
+| No secrets in build args | PASS | No ARG with secrets |
+| apt cache cleaned | PASS | `rm -rf /var/lib/apt/lists/*` |
+| No-cache pip install | PASS | `--no-cache-dir` |
+| Health check | FAIL | No HEALTHCHECK directive |
+
+### Remediation
+
+Add to both Dockerfiles:
+```dockerfile
+RUN adduser --disabled-password --gecos '' appuser
+USER appuser
+HEALTHCHECK --interval=30s --timeout=5s CMD curl -f http://localhost:8080/health || exit 1
+```
+
+## CI/CD Security
+
+No CI/CD pipeline files found in the repository (no `.github/workflows/`, `.gitlab-ci.yml`, `azure-pipelines.yml`, or `Jenkinsfile`). CI/CD security cannot be assessed.
+
+**Recommendation**: When CI/CD is added, include dependency scanning, SAST, secret scanning, and image scanning steps.
+
+## Environment Configuration
+
+| Check | Status | Detail |
+|-------|--------|--------|
+| .env handling | PASS | `.env` is gitignored (root level); `e2e/.env` is tracked but contains only `COMPOSE_PROFILES=cpu` (no secrets) |
+| Secrets in docker-compose | PASS | No credentials in compose files; service URLs are internal Docker network names |
+| Environment separation | PASS | URLs are configurable via env vars (`LOADER_URL`, `ANNOTATIONS_URL`, `VIDEOS_DIR`, `IMAGES_DIR`) |
+| Secret management | N/A | No secrets required by this service (tokens come from HTTP headers) |
+
+## Network Security
+
+| Check | Status | Detail |
+|-------|--------|--------|
+| Exposed ports | WARN | Port 8080 exposed; relies on external network controls for access restriction |
+| TLS configuration | FAIL | No TLS termination in the application; `CMD` runs uvicorn without `--ssl-*` flags |
+| CORS | WARN | No CORSMiddleware configured — browser clients cannot make cross-origin requests (may be intentional if behind API gateway) |
+| Security headers | FAIL | No security headers middleware (see SAST findings) |
+
+## .gitignore Review
+
+| Check | Status | Detail |
+|-------|--------|--------|
+| .env files excluded | PASS | `.env`, `.env.*` patterns in .gitignore |
+| Credentials excluded | PASS | `.cursor/mcp.json` excluded |
+| Binary files excluded | PASS | `.onnx`, media formats excluded |
+| Build artifacts excluded | PASS | `build/`, `dist/`, `*.so`, `*.egg-info/` excluded |
+
+## Summary
+
+| Severity | Count |
+|----------|-------|
+| Critical | 0     |
+| High     | 0     |
+| Medium   | 3 (root containers x2, no TLS) |
+| Low      | 3 (no healthcheck x2, no CORS config) |
diff --git a/_docs/05_security/owasp_review.md b/_docs/05_security/owasp_review.md
new file mode 100644
index 0000000..65c70a3
--- /dev/null
+++ b/_docs/05_security/owasp_review.md
@@ -0,0 +1,67 @@
+# OWASP Top 10 Review
+
+**Date**: 2026-03-31
+**OWASP Version**: 2025 (8th edition)
+**Scope**: Full codebase
+
+## Assessment
+
+| # | Category | Status | Findings |
+|---|----------|--------|----------|
+| A01 | Broken Access Control | FAIL | 3 |
+| A02 | Security Misconfiguration | FAIL | 2 |
+| A03 | Software Supply Chain Failures | FAIL | 2 |
+| A04 | Cryptographic Failures | FAIL | 1 |
+| A05 | Injection | PASS | — |
+| A06 | Insecure Design | FAIL | 2 |
+| A07 | Authentication Failures | FAIL | 1 |
+| A08 | Software or Data Integrity Failures | PASS | — |
+| A09 | Logging & Alerting Failures | FAIL | 1 |
+| A10 | Mishandling of Exceptional Conditions | FAIL | 1 |
+
+## Category Details
+
+### A01: Broken Access Control — FAIL
+
+1. **No authentication required**: All endpoints are publicly accessible. Bearer tokens are optional. Any network-reachable client can trigger inference, access SSE stream, and consume resources.
+2. **SSE stream leaks cross-user data**: `/detect/stream` broadcasts all detection events to all connected clients regardless of identity. Horizontal privilege escalation — any user sees every other user's results.
+3. **No authorization on media operations**: `/detect/{media_id}` reads any media file that the annotations service returns a path for. No check that the requesting user owns that media.
+
+### A02: Security Misconfiguration — FAIL
+
+1. **Docker containers run as root**: Both `Dockerfile` and `Dockerfile.gpu` do not create or switch to a non-root user. Container compromise gives root filesystem access.
+2. **No security headers**: The FastAPI app does not set standard security headers (X-Content-Type-Options, X-Frame-Options, Strict-Transport-Security, Content-Security-Policy).
+
+### A03: Software Supply Chain Failures — FAIL
+
+1. **Unpinned dependencies**: `fastapi`, `uvicorn[standard]`, `python-multipart` are unpinned in `requirements.txt`. A compromised PyPI upload could be silently pulled during build.
+2. **Known CVEs in transitive dependencies**: h11 (via uvicorn) has CVE-2025-43859 (CRITICAL — HTTP request smuggling); python-multipart has CVE-2026-28356 (HIGH — ReDoS).
+
+### A04: Cryptographic Failures — FAIL
+
+1. **JWT tokens not cryptographically verified**: `TokenManager._decode_exp()` and `decode_user_id()` decode JWT payloads via base64 without signature verification. Forged tokens are accepted as valid.
+
+### A05: Injection — PASS
+
+No SQL, command injection, XSS, LDAP, or template injection patterns found. JSON deserialization uses `json.loads()` which is safe. User input does not reach shell commands or query constructors.
+
+### A06: Insecure Design — FAIL
+
+1. **No rate limiting**: Compute-intensive `/detect` endpoint has no rate limiting. With only 2 ThreadPoolExecutor workers, a small number of requests can exhaust inference capacity.
+2. **No input size validation**: Uploaded files are fully read into memory (`await file.read()`) without size checks. Memory exhaustion possible with large payloads.
+
+### A07: Authentication Failures — FAIL
+
+1. **JWT token refresh silently fails**: `TokenManager._refresh()` catches all exceptions and passes silently. Failed refreshes go undetected, potentially allowing expired tokens to be used indefinitely (until the next decode check).
+
+### A08: Software or Data Integrity Failures — PASS
+
+No auto-update mechanisms, no CI/CD pipeline in the repo, no unsigned artifact consumption. Model files are loaded from a trusted internal loader service.
+
+### A09: Logging & Alerting Failures — FAIL
+
+1. **Security events not logged**: Authentication failures, token refresh failures, and unauthorized access attempts are silently swallowed (`except Exception: pass`). No audit trail for security-relevant events.
+
+### A10: Mishandling of Exceptional Conditions — FAIL
+
+1. **Silent exception swallowing**: Multiple `except Exception: pass` blocks (token refresh, annotation posting) hide failures. The system "fails open" — errors are ignored and processing continues, potentially in an inconsistent security state.
diff --git a/_docs/05_security/security_report.md b/_docs/05_security/security_report.md
new file mode 100644
index 0000000..3bb77b6
--- /dev/null
+++ b/_docs/05_security/security_report.md
@@ -0,0 +1,140 @@
+# Security Audit Report
+
+**Date**: 2026-03-31
+**Scope**: Azaion.Detections (full codebase)
+**Verdict**: FAIL
+
+## Summary
+
+| Severity | Count |
+|----------|-------|
+| Critical | 1     |
+| High     | 3     |
+| Medium   | 5     |
+| Low      | 5     |
+
+## OWASP Top 10 Assessment
+
+| Category | Status | Findings |
+|----------|--------|----------|
+| A01 Broken Access Control | FAIL | 3 |
+| A02 Security Misconfiguration | FAIL | 2 |
+| A03 Software Supply Chain Failures | FAIL | 2 |
+| A04 Cryptographic Failures | FAIL | 1 |
+| A05 Injection | PASS | — |
+| A06 Insecure Design | FAIL | 2 |
+| A07 Authentication Failures | FAIL | 1 |
+| A08 Software or Data Integrity Failures | PASS | — |
+| A09 Logging & Alerting Failures | FAIL | 1 |
+| A10 Mishandling of Exceptional Conditions | FAIL | 1 |
+
+## Findings
+
+| # | Severity | Category | Location | Title |
+|---|----------|----------|----------|-------|
+| 1 | Critical | A03 Supply Chain | requirements.txt (uvicorn→h11) | HTTP request smuggling via h11 CVE-2025-43859 |
+| 2 | High | A04 Crypto | src/main.py:67-99 | JWT decoded without signature verification |
+| 3 | High | A01 Access Control | src/main.py (all routes) | No authentication required on any endpoint |
+| 4 | High | A03 Supply Chain | requirements.txt (python-multipart) | ReDoS via python-multipart CVE-2026-28356 |
+| 5 | Medium | A01 Access Control | src/main.py:608-627 | SSE stream broadcasts cross-user data |
+| 6 | Medium | A06 Insecure Design | src/main.py:348-469 | No rate limiting on inference endpoints |
+| 7 | Medium | A02 Misconfig | Dockerfile, Dockerfile.gpu | Containers run as root |
+| 8 | Medium | A03 Supply Chain | requirements.txt | Unpinned critical dependencies |
+| 9 | Medium | A02 Misconfig | Dockerfile, Dockerfile.gpu | No TLS and no security headers |
+| 10 | Low | A06 Insecure Design | src/main.py:357 | No request body size limit |
+| 11 | Low | A10 Exceptions | src/main.py:63,490 | Silent exception swallowing |
+| 12 | Low | A09 Logging | src/main.py | Security events not logged |
+| 13 | Low | A01 Access Control | src/main.py:449-450 | Exception details leaked in responses |
+| 14 | Low | A07 Auth | src/main.py:54-64 | Token refresh failure silently ignored |
+
+### Finding Details
+
+**F1: HTTP Request Smuggling via h11** (Critical / A03)
+- Location: `requirements.txt` — unpinned `uvicorn[standard]` pulls `h11-0.14.0`
+- Description: CVE-2025-43859 (CVSS 9.1). Lenient parsing of chunked-coding line terminators enables HTTP request smuggling.
+- Impact: Bypass security controls, cache poisoning, session hijacking, data leakage
+- Remediation: Pin `h11>=0.15.0` in requirements.txt
+
+**F2: JWT Decoded Without Signature Verification** (High / A04)
+- Location: `src/main.py:67-99` (`TokenManager._decode_exp`, `decode_user_id`)
+- Description: JWT payloads are base64-decoded without cryptographic signature verification. Any client can forge tokens with arbitrary claims.
+- Impact: Full user impersonation — attacker crafts JWT with target's user ID to access their AI settings, post annotations under their account
+- Remediation: Use PyJWT with signature verification against the issuer's public key
+
+**F3: No Authentication on Endpoints** (High / A01)
+- Location: `src/main.py` — all route handlers
+- Description: All endpoints are publicly accessible. Bearer tokens are optional.
+- Impact: Unauthorized inference triggering, resource exhaustion, unauthorized access to SSE event stream
+- Remediation: Add FastAPI dependency injection for auth middleware on `/detect`, `/detect/{media_id}`, `/detect/stream`
+
+**F4: python-multipart ReDoS** (High / A03)
+- Location: `requirements.txt` — unpinned `python-multipart`
+- Description: CVE-2026-28356 (CVSS 7.5). `parse_options_header()` regex causes exponential backtracking on malicious headers.
+- Impact: Denial of service
+- Remediation: Pin `python-multipart>=1.3.1`
+
+**F5: SSE Stream Cross-User Data Leak** (Medium / A01)
+- Location: `src/main.py:608-627`
+- Description: All detection events broadcast to all connected SSE clients without filtering.
+- Impact: Any client sees all users' detection results (media IDs, coordinates, status)
+- Remediation: Associate SSE queues with authenticated users; filter events by ownership
+
+**F6: No Rate Limiting** (Medium / A06)
+- Location: `src/main.py:348-469`, `src/main.py:494-605`
+- Description: No rate limiting on compute-intensive inference endpoints.
+- Impact: DoS via inference exhaustion (2 worker threads)
+- Remediation: Add slowapi or similar rate limiting middleware
+
+**F7: Docker Containers Run as Root** (Medium / A02)
+- Location: `Dockerfile:10`, `Dockerfile.gpu:10`
+- Description: No USER directive; processes run as root inside containers.
+- Impact: Container escape or compromise gives root filesystem access
+- Remediation: Add non-root user (`adduser --disabled-password appuser && USER appuser`)
+
+**F8: Unpinned Critical Dependencies** (Medium / A03)
+- Location: `requirements.txt`
+- Description: `fastapi`, `uvicorn[standard]`, `python-multipart` are unpinned.
+- Impact: Supply chain attack via compromised PyPI package; inconsistent builds across environments
+- Remediation: Pin all dependencies to specific versions
+
+**F9: No TLS / No Security Headers** (Medium / A02)
+- Location: `Dockerfile` CMD, `src/main.py` (app setup)
+- Description: Uvicorn runs without TLS. No security headers middleware.
+- Impact: Data in transit is unencrypted; missing browser security protections
+- Remediation: Terminate TLS at reverse proxy or add `--ssl-*` flags; add security headers middleware
+
+**F10-F14**: Low severity findings (request size limits, exception handling, logging gaps) documented in `static_analysis.md` and `owasp_review.md`.
+
+## Dependency Vulnerabilities
+
+| Package | CVE | Severity | Fix Version |
+|---------|-----|----------|-------------|
+| h11 (via uvicorn) | CVE-2025-43859 | Critical | h11>=0.15.0 |
+| python-multipart | CVE-2026-28356 | High | >=1.3.1 |
+| opencv-python | — | Low (outdated) | 4.13.0.92 |
+
+## Recommendations
+
+### Immediate (Critical/High)
+
+1. **Pin h11>=0.15.0** to fix HTTP request smuggling vulnerability
+2. **Pin python-multipart>=1.3.1** to fix ReDoS vulnerability
+3. **Pin all dependencies** to specific versions in requirements.txt
+4. **Add JWT signature verification** using PyJWT with the issuer's public key
+5. **Add authentication middleware** requiring valid tokens on /detect, /detect/{media_id}, /detect/stream
+
+### Short-term (Medium)
+
+6. **Filter SSE events by user** — associate queues with authenticated sessions
+7. **Add rate limiting** on inference endpoints (slowapi or nginx rate limiting)
+8. **Run containers as non-root** — add USER directive to Dockerfiles
+9. **Add security headers** middleware (X-Content-Type-Options, X-Frame-Options, HSTS)
+10. **Configure TLS** at reverse proxy level or add Dockerfile HEALTHCHECK
+
+### Long-term (Low / Hardening)
+
+11. **Add request body size limits** via uvicorn config or middleware
+12. **Log security events** — authentication failures, token refresh failures, rate limit hits
+13. **Replace silent exception handling** with proper error logging
+14. **Set up CI/CD** with dependency scanning, SAST, and secret scanning
+15. **Add CORS configuration** if browser clients will access the API directly
diff --git a/_docs/05_security/static_analysis.md b/_docs/05_security/static_analysis.md
new file mode 100644
index 0000000..f6751c5
--- /dev/null
+++ b/_docs/05_security/static_analysis.md
@@ -0,0 +1,72 @@
+# Static Analysis (SAST)
+
+**Date**: 2026-03-31
+**Scope**: `src/`, `e2e/`, root config files
+**Method**: Manual code review with pattern matching
+
+## Findings
+
+### F1: JWT Tokens Decoded Without Signature Verification (HIGH)
+
+- **Location**: `src/main.py:67-99` (`TokenManager._decode_exp`, `TokenManager.decode_user_id`)
+- **Description**: JWT payloads are base64-decoded and parsed directly without cryptographic signature verification. Any attacker can forge a JWT with arbitrary claims (user ID, expiration).
+- **Impact**: An attacker can impersonate any user by crafting a JWT with a target's `sub`/`userId` claim, then use that to read their AI settings or post annotations under their account.
+- **Remediation**: Verify JWT signatures using the issuer's public key or shared secret before trusting claims. Use a library like `PyJWT` with `algorithms` and `verify=True`.
+
+### F2: No Authentication Required on Any Endpoint (HIGH)
+
+- **Location**: `src/main.py:325-627` (all route handlers)
+- **Description**: All endpoints (`/detect`, `/detect/{media_id}`, `/detect/stream`, `/health`) are publicly accessible. Bearer tokens are optional — requests without tokens proceed normally with reduced functionality.
+- **Impact**: Any network-reachable client can trigger AI inference (expensive compute), read the SSE event stream (all detection events for all users), and consume server resources.
+- **Remediation**: Add authentication middleware requiring valid tokens on `/detect`, `/detect/{media_id}`, and `/detect/stream`. Keep `/health` public.
+
+### F3: SSE Event Stream Broadcasts to All Clients (MEDIUM)
+
+- **Location**: `src/main.py:608-627` (`detect_stream`)
+- **Description**: The `/detect/stream` SSE endpoint broadcasts all detection events (including `mediaId` and annotations) to every connected client via a shared `_event_queues` list. There is no per-user filtering or authentication.
+- **Impact**: Any connected client receives detection results for all users, leaking media IDs, detection coordinates, and processing status of other users' media.
+- **Remediation**: Associate each SSE queue with an authenticated user and filter events by user ownership.
+
+### F4: No Rate Limiting on Compute-Intensive Endpoints (MEDIUM)
+
+- **Location**: `src/main.py:348-469` (`/detect`), `src/main.py:494-605` (`/detect/{media_id}`)
+- **Description**: No rate limiting or throttling on endpoints that trigger AI inference. The ThreadPoolExecutor has only 2 workers, making it easy to exhaust.
+- **Impact**: An attacker can repeatedly call `/detect` with valid images to consume all inference capacity, causing denial of service for legitimate users.
+- **Remediation**: Add rate limiting middleware (e.g., `slowapi`) on inference endpoints.
+
+### F5: Exception Details Leaked in HTTP Responses (LOW)
+
+- **Location**: `src/main.py:449-450`
+- **Description**: `RuntimeError` and `ValueError` messages are passed directly to `HTTPException(detail=str(e))`. Internal error messages could reveal implementation details.
+- **Impact**: Information disclosure of internal error messages to external clients.
+- **Remediation**: Return generic error messages to clients; log detailed errors server-side only.
+
+### F6: Silent Exception Swallowing (LOW)
+
+- **Location**: `src/main.py:63-64` (`_refresh`), `src/main.py:490-491` (`_post_annotation_to_service`)
+- **Description**: Multiple `except Exception: pass` blocks silently ignore errors, including potential security-relevant failures like token refresh failures or annotation posting failures.
+- **Impact**: Security-relevant events (failed auth refresh, failed API calls) go undetected, making incident investigation difficult.
+- **Remediation**: Log exceptions at WARNING level instead of silently swallowing.
+
+### F7: No Request Body Size Limit (LOW)
+
+- **Location**: `src/main.py:348-469` (`/detect` upload endpoint)
+- **Description**: No explicit limit on uploaded file size. Large uploads could exhaust memory (entire file is read into memory with `await file.read()`).
+- **Impact**: Memory exhaustion DoS via oversized uploads.
+- **Remediation**: Configure uvicorn/nginx `--limit-request-body` or check `Content-Length` before reading.
+
+## Summary
+
+| Severity | Count |
+|----------|-------|
+| Critical | 0     |
+| High     | 2     |
+| Medium   | 2     |
+| Low      | 3     |
+
+## No Issues Found
+
+- **Injection**: No SQL, command injection, XSS, or template injection patterns detected
+- **Hardcoded credentials**: No secrets, API keys, or passwords in source code
+- **Insecure deserialization**: No pickle/marshal usage
+- **File write safety**: Upload paths use content hashes with whitelisted extensions
diff --git a/_docs/_autopilot_state.md b/_docs/_autopilot_state.md
index 8d8bb3a..573e14f 100644
--- a/_docs/_autopilot_state.md
+++ b/_docs/_autopilot_state.md
@@ -2,8 +2,8 @@
 
 ## Current Step
 flow: existing-code
-step: 9
-name: Implement
-status: in_progress
+step: 13
+name: Performance Test
+status: not_started
 sub_step: 0
 retry_count: 0
diff --git a/e2e/requirements.txt b/e2e/requirements.txt
index f8d8c84..feb735c 100644
--- a/e2e/requirements.txt
+++ b/e2e/requirements.txt
@@ -5,3 +5,4 @@ sseclient-py
 pytest-timeout
 flask
 gunicorn
+httpx
diff --git a/e2e/tests/test_streaming_video_upload.py b/e2e/tests/test_streaming_video_upload.py
new file mode 100644
index 0000000..811c101
--- /dev/null
+++ b/e2e/tests/test_streaming_video_upload.py
@@ -0,0 +1,249 @@
+"""
+AZ-178: True streaming video detection — e2e test.
+
+Run with:  pytest e2e/tests/test_streaming_video_upload.py -s -v
+The -s flag is required to see real-time SSE output on the console.
+"""
+import json
+import os
+import shutil
+import subprocess
+import threading
+import time
+from pathlib import Path
+
+import pytest
+import sseclient
+
+FIXTURES_DIR = Path(__file__).resolve().parent.parent / "fixtures"
+
+
+def _fixture_path(name: str) -> str:
+    p = FIXTURES_DIR / name
+    if not p.is_file():
+        pytest.skip(f"missing fixture {p}")
+    return str(p)
+
+
+def _ensure_faststart(source_name: str, target_name: str) -> str:
+    target = FIXTURES_DIR / target_name
+    if target.is_file():
+        return str(target)
+    source = FIXTURES_DIR / source_name
+    if not source.is_file():
+        pytest.skip(f"missing source fixture {source}")
+    ffmpeg = shutil.which("ffmpeg")
+    if not ffmpeg:
+        pytest.skip("ffmpeg not found — needed to create faststart fixture")
+    subprocess.run(
+        [ffmpeg, "-y", "-i", str(source), "-c", "copy", "-movflags", "+faststart", str(target)],
+        capture_output=True, check=True,
+    )
+    return str(target)
+
+
+def _chunked_reader(path: str, chunk_size: int = 64 * 1024):
+    with open(path, "rb") as f:
+        while True:
+            chunk = f.read(chunk_size)
+            if not chunk:
+                break
+            yield chunk
+
+
+@pytest.mark.slow
+@pytest.mark.timeout(900)
+def test_streaming_video_detections_appear_during_upload(warm_engine, http_client):
+    """Upload video_1 (faststart) via POST /detect/video and print SSE events as they arrive."""
+    # Arrange
+    video_path = _ensure_faststart("video_1.mp4", "video_1_faststart.mp4")
+    file_size_mb = os.path.getsize(video_path) / (1024 * 1024)
+
+    events_log: list[tuple[float, dict]] = []
+    thread_exc: list[BaseException] = []
+    first_detection_time: list[float] = []
+    upload_started = threading.Event()
+    done = threading.Event()
+    media_id_holder: list[str] = []
+
+    print(f"\n{'='*80}")
+    print(f"  AZ-178 STREAMING VIDEO TEST")
+    print(f"  File: video_1_faststart.mp4 ({file_size_mb:.1f} MB, faststart)")
+    print(f"{'='*80}")
+
+    def _listen_sse():
+        try:
+            with http_client.get("/detect/stream", stream=True, timeout=600) as resp:
+                resp.raise_for_status()
+                sse = sseclient.SSEClient(resp)
+                upload_started.wait(timeout=30)
+                for event in sse.events():
+                    if not event.data or not str(event.data).strip():
+                        continue
+                    data = json.loads(event.data)
+                    if media_id_holder and data.get("mediaId") != media_id_holder[0]:
+                        continue
+                    now = time.monotonic()
+                    events_log.append((now, data))
+
+                    status = data.get("mediaStatus", "?")
+                    percent = data.get("mediaPercent", 0)
+                    n_det = len(data.get("annotations", []))
+                    labels = [a["label"] for a in data.get("annotations", [])]
+
+                    if n_det > 0 and not first_detection_time:
+                        first_detection_time.append(now)
+
+                    elapsed_since_upload = ""
+                    if upload_started.is_set():
+                        elapsed_since_upload = f" (t+{now - upload_start_mono[0]:.2f}s)"
+
+                    print(
+                        f"  SSE | {status:15s} | {percent:3d}% | "
+                        f"{n_det:2d} detections | {labels}{elapsed_since_upload}"
+                    )
+
+                    if status == "AIProcessed" and percent == 100:
+                        break
+                    if status == "Error":
+                        break
+        except BaseException as e:
+            thread_exc.append(e)
+        finally:
+            done.set()
+
+    upload_start_mono: list[float] = []
+
+    # Act
+    sse_thread = threading.Thread(target=_listen_sse, daemon=True)
+    sse_thread.start()
+    time.sleep(0.5)
+
+    print(f"\n  >>> Starting upload...")
+    upload_start_mono.append(time.monotonic())
+    upload_started.set()
+
+    r = http_client.post(
+        "/detect/video",
+        data=_chunked_reader(video_path),
+        headers={
+            "X-Filename": "video_1_faststart.mp4",
+            "Content-Type": "application/octet-stream",
+        },
+        timeout=600,
+    )
+
+    upload_end = time.monotonic()
+    upload_duration = upload_end - upload_start_mono[0]
+    print(f"\n  >>> Upload complete in {upload_duration:.2f}s")
+    print(f"  >>> Response: {r.status_code} {r.json()}")
+
+    if r.status_code == 200:
+        media_id_holder.append(r.json().get("mediaId", ""))
+
+    ok = done.wait(timeout=600)
+
+    # Assert
+    print(f"\n{'='*80}")
+    print(f"  RESULTS")
+    print(f"{'='*80}")
+    print(f"  Total SSE events:        {len(events_log)}")
+    detection_events = [e for _, e in events_log if len(e.get("annotations", [])) > 0]
+    print(f"  Events with detections:  {len(detection_events)}")
+    print(f"  Upload duration:         {upload_duration:.2f}s")
+
+    if first_detection_time:
+        ttfd = first_detection_time[0] - upload_start_mono[0]
+        print(f"  Time to first detection: {ttfd:.2f}s")
+        if ttfd < upload_duration:
+            print(f"  >>> STREAMING CONFIRMED: first detection arrived {upload_duration - ttfd:.1f}s BEFORE upload finished")
+        else:
+            print(f"  >>> Detection arrived after upload (moov-at-end or slow inference)")
+    else:
+        print(f"  Time to first detection: (none)")
+
+    if events_log:
+        final = events_log[-1][1]
+        print(f"  Final status:            {final.get('mediaStatus')} ({final.get('mediaPercent')}%)")
+    print(f"{'='*80}\n")
+
+    assert not thread_exc, f"SSE thread error: {thread_exc}"
+    assert r.status_code == 200
+    assert ok, "SSE listener did not finish"
+
+
+@pytest.mark.slow
+@pytest.mark.timeout(900)
+def test_non_faststart_video_still_works(warm_engine, http_client):
+    """Upload the original video_1.mp4 (moov at end) — should still work, just not stream."""
+    # Arrange
+    video_path = _fixture_path("video_1.mp4")
+    file_size_mb = os.path.getsize(video_path) / (1024 * 1024)
+
+    events_log: list[tuple[float, dict]] = []
+    thread_exc: list[BaseException] = []
+    done = threading.Event()
+    upload_started = threading.Event()
+
+    print(f"\n{'='*80}")
+    print(f"  NON-FASTSTART FALLBACK TEST")
+    print(f"  File: video_1.mp4 ({file_size_mb:.1f} MB, moov at end)")
+    print(f"{'='*80}")
+
+    def _listen_sse():
+        try:
+            with http_client.get("/detect/stream", stream=True, timeout=600) as resp:
+                resp.raise_for_status()
+                sse = sseclient.SSEClient(resp)
+                upload_started.wait(timeout=30)
+                for event in sse.events():
+                    if not event.data or not str(event.data).strip():
+                        continue
+                    data = json.loads(event.data)
+                    now = time.monotonic()
+                    events_log.append((now, data))
+
+                    status = data.get("mediaStatus", "?")
+                    percent = data.get("mediaPercent", 0)
+                    n_det = len(data.get("annotations", []))
+
+                    print(f"  SSE | {status:15s} | {percent:3d}% | {n_det:2d} detections")
+
+                    if status in ("AIProcessed", "Error") and percent == 100:
+                        break
+        except BaseException as e:
+            thread_exc.append(e)
+        finally:
+            done.set()
+
+    # Act
+    sse_thread = threading.Thread(target=_listen_sse, daemon=True)
+    sse_thread.start()
+    time.sleep(0.5)
+
+    print(f"\n  >>> Starting upload...")
+    t0 = time.monotonic()
+    upload_started.set()
+
+    r = http_client.post(
+        "/detect/video",
+        data=_chunked_reader(video_path),
+        headers={
+            "X-Filename": "video_1.mp4",
+            "Content-Type": "application/octet-stream",
+        },
+        timeout=600,
+    )
+
+    upload_duration = time.monotonic() - t0
+    print(f"\n  >>> Upload + response in {upload_duration:.2f}s")
+    print(f"  >>> Response: {r.status_code} {r.json()}")
+
+    ok = done.wait(timeout=600)
+
+    # Assert
+    assert not thread_exc, f"SSE thread error: {thread_exc}"
+    assert r.status_code == 200
+    assert ok, "SSE listener did not finish"
+    print(f"  Total SSE events: {len(events_log)}")
+    print(f"{'='*80}\n")
diff --git a/run-tests.sh b/run-tests.sh
index 7662fda..8baaf49 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -19,6 +19,9 @@ trap cleanup EXIT
 
 PY="$(command -v python3 2>/dev/null || command -v python 2>/dev/null || echo python)"
 
+echo "Installing dependencies ..."
+"$PY" -m pip install -q -r "$ROOT/requirements.txt" -r "$ROOT/e2e/requirements.txt"
+
 echo "Building Cython extensions ..."
 "$PY" setup.py build_ext --inplace
 
diff --git a/scripts/deploy.sh b/scripts/deploy.sh
new file mode 100755
index 0000000..2a341da
--- /dev/null
+++ b/scripts/deploy.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+if [ -f "$PROJECT_ROOT/.env" ]; then
+    set -a
+    source "$PROJECT_ROOT/.env"
+    set +a
+fi
+
+REGISTRY="${REGISTRY:?REGISTRY is required}"
+IMAGE_TAG="${IMAGE_TAG:?IMAGE_TAG is required}"
+DEPLOY_HOST="${DEPLOY_HOST:-}"
+DEPLOY_USER="${DEPLOY_USER:-deploy}"
+
+usage() {
+    echo "Usage: $0 [--rollback] [--help]"
+    echo ""
+    echo "Deploy Azaion.Detections to a target machine."
+    echo ""
+    echo "Options:"
+    echo "  --rollback  Redeploy the previous image tag"
+    echo "  --help      Show this help message"
+    echo ""
+    echo "Environment variables:"
+    echo "  REGISTRY     Container registry URL (required)"
+    echo "  IMAGE_TAG    Image tag to deploy (required)"
+    echo "  DEPLOY_HOST  Target machine (empty = local)"
+    echo "  DEPLOY_USER  SSH user (default: deploy)"
+    exit 0
+}
+
+run_cmd() {
+    if [ -n "$DEPLOY_HOST" ]; then
+        ssh "${DEPLOY_USER}@${DEPLOY_HOST}" "$@"
+    else
+        eval "$@"
+    fi
+}
+
+ROLLBACK=0
+for arg in "$@"; do
+    case "$arg" in
+        --rollback) ROLLBACK=1 ;;
+        --help) usage ;;
+    esac
+done
+
+if [ "$ROLLBACK" -eq 1 ]; then
+    PREV_TAG_FILE="$PROJECT_ROOT/.deploy-previous-tag"
+    if [ ! -f "$PREV_TAG_FILE" ]; then
+        echo "ERROR: No previous tag found at $PREV_TAG_FILE"
+        exit 1
+    fi
+    IMAGE_TAG="$(cat "$PREV_TAG_FILE")"
+    echo "Rolling back to image tag: $IMAGE_TAG"
+fi
+
+echo "=== Deploying Azaion.Detections ==="
+echo "Registry:  $REGISTRY"
+echo "Image tag: $IMAGE_TAG"
+echo "Target:    ${DEPLOY_HOST:-localhost}"
+echo ""
+
+echo "--- Pulling images ---"
+bash "$SCRIPT_DIR/pull-images.sh"
+
+echo "--- Stopping services ---"
+bash "$SCRIPT_DIR/stop-services.sh"
+
+echo "--- Starting services ---"
+bash "$SCRIPT_DIR/start-services.sh"
+
+echo "--- Health check ---"
+if bash "$SCRIPT_DIR/health-check.sh"; then
+    echo "=== Deployment successful ==="
+else
+    echo "ERROR: Health check failed. Run '$0 --rollback' to revert."
+    exit 1
+fi
diff --git a/scripts/health-check.sh b/scripts/health-check.sh
new file mode 100755
index 0000000..319927a
--- /dev/null
+++ b/scripts/health-check.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -euo pipefail
+
+HOST="${HEALTH_CHECK_HOST:-localhost}"
+PORT="${HEALTH_CHECK_PORT:-8080}"
+MAX_RETRIES=10
+RETRY_INTERVAL=3
+
+usage() {
+    echo "Usage: $0 [--help]"
+    echo "Check health of Azaion.Detections service."
+    echo ""
+    echo "Environment variables:"
+    echo "  HEALTH_CHECK_HOST  Target host (default: localhost)"
+    echo "  HEALTH_CHECK_PORT  Target port (default: 8080)"
+    exit 0
+}
+
+for arg in "$@"; do
+    case "$arg" in
+        --help) usage ;;
+    esac
+done
+
+URL="http://${HOST}:${PORT}/health"
+echo "Checking health at $URL ..."
+
+for i in $(seq 1 "$MAX_RETRIES"); do
+    if curl -sf "$URL" > /dev/null 2>&1; then
+        RESPONSE="$(curl -sf "$URL")"
+        echo "Health check passed (attempt $i/$MAX_RETRIES): $RESPONSE"
+        exit 0
+    fi
+    echo "Attempt $i/$MAX_RETRIES failed, retrying in ${RETRY_INTERVAL}s..."
+    sleep "$RETRY_INTERVAL"
+done
+
+echo "ERROR: Health check failed after $MAX_RETRIES attempts."
+exit 1
diff --git a/scripts/pull-images.sh b/scripts/pull-images.sh
new file mode 100755
index 0000000..e0d6d58
--- /dev/null
+++ b/scripts/pull-images.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+if [ -f "$PROJECT_ROOT/.env" ]; then
+    set -a
+    source "$PROJECT_ROOT/.env"
+    set +a
+fi
+
+REGISTRY="${REGISTRY:?REGISTRY is required}"
+IMAGE_TAG="${IMAGE_TAG:?IMAGE_TAG is required}"
+
+usage() {
+    echo "Usage: $0 [--help]"
+    echo "Pull Docker images for Azaion.Detections from the registry."
+    exit 0
+}
+
+for arg in "$@"; do
+    case "$arg" in
+        --help) usage ;;
+    esac
+done
+
+IMAGE="${REGISTRY}/azaion/detections-cpu:${IMAGE_TAG}"
+
+echo "Pulling $IMAGE ..."
+docker pull "$IMAGE"
+
+echo "Verifying image digest..."
+docker inspect --format='{{index .RepoDigests 0}}' "$IMAGE" 2>/dev/null || echo "Warning: digest not available for local images"
+
+echo "Pull complete."
diff --git a/scripts/start-services.sh b/scripts/start-services.sh
new file mode 100755
index 0000000..a805805
--- /dev/null
+++ b/scripts/start-services.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+if [ -f "$PROJECT_ROOT/.env" ]; then
+    set -a
+    source "$PROJECT_ROOT/.env"
+    set +a
+fi
+
+REGISTRY="${REGISTRY:?REGISTRY is required}"
+IMAGE_TAG="${IMAGE_TAG:?IMAGE_TAG is required}"
+LOADER_URL="${LOADER_URL:-http://loader:8080}"
+ANNOTATIONS_URL="${ANNOTATIONS_URL:-http://annotations:8080}"
+
+usage() {
+    echo "Usage: $0 [--help]"
+    echo "Start Azaion.Detections service container."
+    exit 0
+}
+
+for arg in "$@"; do
+    case "$arg" in
+        --help) usage ;;
+    esac
+done
+
+IMAGE="${REGISTRY}/azaion/detections-cpu:${IMAGE_TAG}"
+CONTAINER_NAME="azaion-detections"
+
+echo "Starting $CONTAINER_NAME from $IMAGE ..."
+
+docker run -d \
+    --name "$CONTAINER_NAME" \
+    --restart unless-stopped \
+    -p 8080:8080 \
+    -e LOADER_URL="$LOADER_URL" \
+    -e ANNOTATIONS_URL="$ANNOTATIONS_URL" \
+    -v "$(pwd)/classes.json:/app/classes.json:ro" \
+    --shm-size=512m \
+    --memory=4g \
+    "$IMAGE"
+
+echo "Container $CONTAINER_NAME started."
diff --git a/scripts/stop-services.sh b/scripts/stop-services.sh
new file mode 100755
index 0000000..920c707
--- /dev/null
+++ b/scripts/stop-services.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+CONTAINER_NAME="azaion-detections"
+GRACE_PERIOD=30
+
+usage() {
+    echo "Usage: $0 [--help]"
+    echo "Gracefully stop Azaion.Detections service container."
+    exit 0
+}
+
+for arg in "$@"; do
+    case "$arg" in
+        --help) usage ;;
+    esac
+done
+
+if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
+    CURRENT_IMAGE="$(docker inspect --format='{{.Config.Image}}' "$CONTAINER_NAME" 2>/dev/null || true)"
+    if [ -n "$CURRENT_IMAGE" ]; then
+        CURRENT_TAG="${CURRENT_IMAGE##*:}"
+        echo "$CURRENT_TAG" > "$PROJECT_ROOT/.deploy-previous-tag"
+        echo "Saved previous tag: $CURRENT_TAG"
+    fi
+
+    echo "Stopping $CONTAINER_NAME (${GRACE_PERIOD}s grace period)..."
+    docker stop -t "$GRACE_PERIOD" "$CONTAINER_NAME" || true
+    docker rm "$CONTAINER_NAME" 2>/dev/null || true
+    echo "Stopped and removed $CONTAINER_NAME."
+else
+    echo "Container $CONTAINER_NAME is not running."
+fi
+
+docker container prune -f --filter "label=com.docker.compose.project=azaion-detections" 2>/dev/null || true
diff --git a/src/inference.pyx b/src/inference.pyx
index de12085..f11581e 100644
--- a/src/inference.pyx
+++ b/src/inference.pyx
@@ -199,6 +199,25 @@ cdef class Inference:
             writer_done.wait()
             wt.join(timeout=3600)
 
+    cpdef run_detect_video_stream(self, object readable, AIRecognitionConfig ai_config, str media_name,
+                                  object annotation_callback, object status_callback=None):
+        cdef str original_media_name
+        self._annotation_callback = annotation_callback
+        self._status_callback = status_callback
+        self.stop_signal = <bint>False
+        self.init_ai()
+        if self.engine is None:
+            constants_inf.log(<str> "AI engine not available. Conversion may be in progress. Skipping inference.")
+            return
+        original_media_name = media_name.replace(" ", "")
+        self.detection_counts = {}
+        self.detection_counts[original_media_name] = 0
+        container = av.open(readable)
+        try:
+            self._process_video_pyav(ai_config, original_media_name, container)
+        finally:
+            container.close()
+
     cdef _process_video_pyav(self, AIRecognitionConfig ai_config, str original_media_name, object container):
         cdef int frame_count = 0
         cdef int batch_count = 0
diff --git a/src/main.py b/src/main.py
index e31330b..205c9b3 100644
--- a/src/main.py
+++ b/src/main.py
@@ -467,6 +467,138 @@ async def detect_image(
                 pass
 
 
+@app.post("/detect/video")
+async def detect_video_upload(request: Request):
+    from media_hash import compute_media_content_hash_from_file
+    from inference import ai_config_from_dict
+    from streaming_buffer import StreamingBuffer
+
+    filename = request.headers.get("x-filename", "upload.mp4")
+    config_json = request.headers.get("x-config", "")
+    ext = _normalize_upload_ext(filename)
+    if ext not in _VIDEO_EXTENSIONS:
+        raise HTTPException(status_code=400, detail="Expected a video file extension")
+
+    config_dict = json.loads(config_json) if config_json else {}
+    ai_cfg = ai_config_from_dict(config_dict)
+
+    auth_header = request.headers.get("authorization", "")
+    access_token = auth_header.removeprefix("Bearer ").strip() if auth_header else ""
+    refresh_token = request.headers.get("x-refresh-token", "")
+    token_mgr = TokenManager(access_token, refresh_token) if access_token else None
+    user_id = TokenManager.decode_user_id(access_token) if access_token else None
+
+    videos_dir = os.environ.get(
+        "VIDEOS_DIR", os.path.join(os.getcwd(), "data", "videos")
+    )
+    os.makedirs(videos_dir, exist_ok=True)
+    content_length = request.headers.get("content-length")
+    total_size = int(content_length) if content_length else None
+    buffer = StreamingBuffer(videos_dir, total_size=total_size)
+    media_name = Path(filename).stem.replace(" ", "")
+    loop = asyncio.get_event_loop()
+    inf = get_inference()
+
+    def _enqueue(event):
+        for q in _event_queues:
+            try:
+                q.put_nowait(event)
+            except asyncio.QueueFull:
+                pass
+
+    placeholder_id = f"tmp_{os.path.basename(buffer.path)}"
+
+    def on_annotation(annotation, percent):
+        dtos = [detection_to_dto(d) for d in annotation.detections]
+        event = DetectionEvent(
+            annotations=dtos,
+            mediaId=placeholder_id,
+            mediaStatus="AIProcessing",
+            mediaPercent=percent,
+        )
+        loop.call_soon_threadsafe(_enqueue, event)
+
+    def on_status(media_name_cb, count):
+        event = DetectionEvent(
+            annotations=[],
+            mediaId=placeholder_id,
+            mediaStatus="AIProcessed",
+            mediaPercent=100,
+        )
+        loop.call_soon_threadsafe(_enqueue, event)
+
+    def run_inference():
+        inf.run_detect_video_stream(buffer, ai_cfg, media_name, on_annotation, on_status)
+
+    inference_future = loop.run_in_executor(executor, run_inference)
+
+    try:
+        async for chunk in request.stream():
+            await loop.run_in_executor(None, buffer.append, chunk)
+    except Exception:
+        buffer.close_writer()
+        buffer.close()
+        raise
+    buffer.close_writer()
+
+    content_hash = compute_media_content_hash_from_file(buffer.path)
+    if not ext.startswith("."):
+        ext = "." + ext
+    storage_path = os.path.abspath(os.path.join(videos_dir, f"{content_hash}{ext}"))
+
+    if token_mgr and user_id:
+        os.rename(buffer.path, storage_path)
+        payload = {
+            "id": content_hash,
+            "name": Path(filename).name,
+            "path": storage_path,
+            "mediaType": "Video",
+            "mediaStatus": _MEDIA_STATUS_NEW,
+            "userId": user_id,
+        }
+        bearer = token_mgr.get_valid_token()
+        _post_media_record(payload, bearer)
+        _put_media_status(content_hash, _MEDIA_STATUS_AI_PROCESSING, bearer)
+
+    async def _wait_inference():
+        try:
+            await inference_future
+            if token_mgr and user_id:
+                _put_media_status(
+                    content_hash, _MEDIA_STATUS_AI_PROCESSED,
+                    token_mgr.get_valid_token(),
+                )
+            done_event = DetectionEvent(
+                annotations=[],
+                mediaId=content_hash,
+                mediaStatus="AIProcessed",
+                mediaPercent=100,
+            )
+            _enqueue(done_event)
+        except Exception:
+            if token_mgr and user_id:
+                _put_media_status(
+                    content_hash, _MEDIA_STATUS_ERROR,
+                    token_mgr.get_valid_token(),
+                )
+            err_event = DetectionEvent(
+                annotations=[], mediaId=content_hash,
+                mediaStatus="Error", mediaPercent=0,
+            )
+            _enqueue(err_event)
+        finally:
+            _active_detections.pop(content_hash, None)
+            buffer.close()
+            if not (token_mgr and user_id) and os.path.isfile(buffer.path):
+                try:
+                    os.unlink(buffer.path)
+                except OSError:
+                    pass
+
+    _active_detections[content_hash] = asyncio.create_task(_wait_inference())
+    return {"status": "started", "mediaId": content_hash}
+
+
 def _post_annotation_to_service(token_mgr: TokenManager, media_id: str,
                                  annotation, dtos: list[DetectionDto]):
     try:
diff --git a/src/media_hash.py b/src/media_hash.py
index 262c05e..a9b12af 100644
--- a/src/media_hash.py
+++ b/src/media_hash.py
@@ -1,3 +1,5 @@
+import os
+
 import xxhash
 
 
@@ -14,3 +16,21 @@ def compute_media_content_hash(data: bytes, virtual: bool = False) -> str:
     payload = _sampling_payload(data)
     h = xxhash.xxh64(payload).hexdigest()
     return f"V{h}" if virtual else h
+
+
+def compute_media_content_hash_from_file(path: str, virtual: bool = False) -> str:
+    n = os.path.getsize(path)
+    size_bytes = n.to_bytes(8, "little")
+    with open(path, "rb") as f:
+        if n < 3072:
+            payload = size_bytes + f.read()
+        else:
+            first = f.read(1024)
+            mid = (n - 1024) // 2
+            f.seek(mid)
+            middle = f.read(1024)
+            f.seek(-1024, 2)
+            last = f.read(1024)
+            payload = size_bytes + first + middle + last
+    h = xxhash.xxh64(payload).hexdigest()
+    return f"V{h}" if virtual else h
diff --git a/src/streaming_buffer.py b/src/streaming_buffer.py
new file mode 100644
index 0000000..6d3902a
--- /dev/null
+++ b/src/streaming_buffer.py
@@ -0,0 +1,93 @@
+import os
+import tempfile
+import threading
+
+
+class StreamingBuffer:
+    def __init__(self, temp_dir: str | None = None, total_size: int | None = None):
+        fd, self.path = tempfile.mkstemp(dir=temp_dir, suffix=".tmp")
+        os.close(fd)
+        self._writer = open(self.path, "wb")
+        self._reader = open(self.path, "rb")
+        self._written = 0
+        self._total_size = total_size
+        self._eof = False
+        self._cond = threading.Condition()
+
+    def append(self, data: bytes) -> None:
+        with self._cond:
+            self._writer.write(data)
+            self._writer.flush()
+            self._written += len(data)
+            self._cond.notify_all()
+
+    def close_writer(self) -> None:
+        with self._cond:
+            self._writer.close()
+            self._eof = True
+            self._cond.notify_all()
+
+    def read(self, size: int = -1) -> bytes:
+        with self._cond:
+            pos = self._reader.tell()
+            if size < 0:
+                while not self._eof:
+                    self._cond.wait()
+                to_read = self._written - pos
+            else:
+                while self._written <= pos and not self._eof:
+                    self._cond.wait()
+                available = self._written - pos
+                if available <= 0:
+                    return b""
+                to_read = min(size, available)
+        return self._reader.read(to_read)
+
+    def seek(self, offset: int, whence: int = 0) -> int:
+        with self._cond:
+            if whence == 2:
+                if self._total_size is not None:
+                    target = self._total_size + offset
+                    self._reader.seek(target, 0)
+                    return target
+                while not self._eof:
+                    self._cond.wait()
+                return self._reader.seek(offset, 2)
+            if whence == 1:
+                target = self._reader.tell() + offset
+            else:
+                target = offset
+            if target <= self._written:
+                return self._reader.seek(offset, whence)
+            if self._total_size is not None:
+                self._reader.seek(target, 0)
+                return target
+            while target > self._written and not self._eof:
+                self._cond.wait()
+            return self._reader.seek(offset, whence)
+
+    def tell(self) -> int:
+        return self._reader.tell()
+
+    def readable(self) -> bool:
+        return True
+
+    def seekable(self) -> bool:
+        return True
+
+    def writable(self) -> bool:
+        return False
+
+    def close(self) -> None:
+        try:
+            self._reader.close()
+        except Exception:
+            pass
+        try:
+            self._writer.close()
+        except Exception:
+            pass
+
+    @property
+    def written(self) -> int:
+        return self._written
diff --git a/tests/test_az178_realvideo_streaming.py b/tests/test_az178_realvideo_streaming.py
new file mode 100644
index 0000000..c1f1826
--- /dev/null
+++ b/tests/test_az178_realvideo_streaming.py
@@ -0,0 +1,153 @@
+"""
+AZ-178: Streaming video detection with real AI inference.
+Uses video_1_faststart.mp4. Stops after 10 seconds.
+
+Requires services (run via run-tests.sh) for model download.
+Run:  sh run-tests.sh -k test_frames_decoded
+"""
+import os
+import threading
+import time
+from pathlib import Path
+
+import pytest
+
+FIXTURES_DIR = Path(__file__).resolve().parent.parent / "e2e" / "fixtures"
+FASTSTART_PATH = FIXTURES_DIR / "video_1_faststart.mp4"
+
+
+@pytest.fixture(scope="module")
+def faststart_video():
+    if FASTSTART_PATH.is_file():
+        return str(FASTSTART_PATH)
+    source = FIXTURES_DIR / "video_1.mp4"
+    if not source.is_file():
+        pytest.skip(f"missing source fixture {source}")
+    import shutil
+    import subprocess
+    ffmpeg = shutil.which("ffmpeg")
+    if not ffmpeg:
+        pytest.skip("ffmpeg not found")
+    subprocess.run(
+        [ffmpeg, "-y", "-i", str(source), "-c", "copy", "-movflags", "+faststart", str(FASTSTART_PATH)],
+        capture_output=True, check=True,
+    )
+    return str(FASTSTART_PATH)
+
+
+def test_frames_decoded_while_upload_in_progress(faststart_video):
+    from streaming_buffer import StreamingBuffer
+
+    loader_url = os.environ.get("LOADER_URL") or os.environ.get("MOCK_LOADER_URL")
+    if not loader_url:
+        pytest.skip("LOADER_URL/MOCK_LOADER_URL not set — run via run-tests.sh for real detections")
+
+    from inference import Inference, ai_config_from_dict
+    from loader_http_client import LoaderHttpClient
+
+    client = LoaderHttpClient(loader_url)
+    inf = Inference(client)
+    if not inf.is_engine_ready:
+        pytest.skip("AI engine not available (model download failed)")
+
+    # Arrange
+    ai_cfg = ai_config_from_dict({})
+    file_size = os.path.getsize(faststart_video)
+    chunk_size = 64 * 1024
+    buf = StreamingBuffer(total_size=file_size)
+
+    bytes_written = [0]
+    stop_flag = threading.Event()
+    writer_start = [0.0]
+    detections_log = []
+    first_det_time = []
+    inf_error = []
+
+    from constants_inf import get_annotation_name
+
+    def on_annotation(annotation, percent):
+        now = time.monotonic()
+        if not first_det_time:
+            first_det_time.append(now)
+        written_mb = bytes_written[0] / (1024 * 1024)
+        pct_file = bytes_written[0] * 100 / file_size
+        elapsed = now - writer_start[0]
+        det_strs = [
+            f"{get_annotation_name(d.cls)}:{d.confidence*100:.0f}% @({d.x:.3f},{d.y:.3f} {d.w:.3f}x{d.h:.3f})"
+            for d in annotation.detections
+        ]
+        detections_log.append((now, annotation, percent))
+        print(f"  DET | {elapsed:7.2f}s | {written_mb:8.1f} MB | {pct_file:5.1f}% file | "
+              f"{percent:3d}% video | {len(annotation.detections)} dets | {det_strs}")
+
+    def on_status(media_name, count):
+        print(f"  STATUS | {media_name}: {count} total detections")
+
+    def writer():
+        writer_start[0] = time.monotonic()
+        with open(faststart_video, "rb") as f:
+            while not stop_flag.is_set():
+                chunk = f.read(chunk_size)
+                if not chunk:
+                    break
+                buf.append(chunk)
+                bytes_written[0] += len(chunk)
+                time.sleep(0.001)
+        buf.close_writer()
+
+    def run_inference():
+        try:
+            inf.run_detect_video_stream(buf, ai_cfg, "streaming_test", on_annotation, on_status)
+        except Exception as e:
+            inf_error.append(e)
+
+    print(f"\n  Video: {file_size/(1024*1024):.1f} MB (faststart)")
+    print(f"  {'':>6s}   {'Time':>8s}   {'Written':>10s}   {'% File':>7s}   {'% Vid':>5s}   {'Dets':>4s}   Labels")
+    print(f"  {'-'*80}")
+
+    # Act
+    wt = threading.Thread(target=writer, daemon=True)
+    wt.start()
+
+    inf_thread = threading.Thread(target=run_inference, daemon=True)
+    inf_thread.start()
+
+    inf_thread.join(timeout=10.0)
+
+    inf.stop()
+    stop_flag.set()
+    buf.close_writer()
+    wt.join(timeout=5)
+    inf_thread.join(timeout=5)
+
+    try:
+        buf.close()
+        os.unlink(buf.path)
+    except Exception:
+        pass
+
+    # Assert
+    written_mb = bytes_written[0] / (1024 * 1024)
+    print(f"\n  {'='*60}")
+    print(f"  RESULTS")
+    print(f"  {'='*60}")
+    print(f"  Detections received:     {len(detections_log)}")
+    print(f"  File uploaded:           {written_mb:.1f} / {file_size/(1024*1024):.1f} MB")
+
+    if first_det_time:
+        ttfd = first_det_time[0] - writer_start[0]
+        pct_at_first = bytes_written[0] * 100 / file_size
+        print(f"  Time to first detection: {ttfd:.3f}s")
+        if pct_at_first < 100:
+            print(f"  >>> STREAMING CONFIRMED: detections arrived while upload in progress")
+        else:
+            print(f"  >>> Detections arrived after full upload")
+    else:
+        print(f"  Time to first detection: (none — no detections in 10s)")
+
+    if inf_error:
+        print(f"  Inference error:         {inf_error[0]}")
+    print(f"  {'='*60}\n")
+
+    assert not inf_error, f"Inference error: {inf_error}"
+    assert len(detections_log) > 0, "no detections received in 10s"
diff --git a/tests/test_az178_streaming_video.py b/tests/test_az178_streaming_video.py
new file mode 100644
index 0000000..f3985e2
--- /dev/null
+++ b/tests/test_az178_streaming_video.py
@@ -0,0 +1,425 @@
+import asyncio
+import base64
+import json
+import os
+import tempfile
+import threading
+import time
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+class TestStreamingBuffer:
+    def test_sequential_write_read(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        try:
+            buf.append(b"hello")
+            buf.append(b" world")
+            buf.close_writer()
+            # Act
+            result = buf.read(-1)
+            # Assert
+            assert result == b"hello world"
+        finally:
+            buf.close()
+            os.unlink(buf.path)
+
+    def test_read_blocks_until_data_available(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        results = []
+
+        def writer():
+            time.sleep(0.1)
+            buf.append(b"data")
+            buf.close_writer()
+
+        t = threading.Thread(target=writer)
+        t.start()
+
+        # Act
+        results.append(buf.read(4))
+        t.join(timeout=5)
+
+        # Assert
+        assert results == [b"data"]
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_read_returns_empty_on_eof(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        buf.close_writer()
+
+        # Act
+        result = buf.read(1024)
+
+        # Assert
+        assert result == b""
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_concurrent_write_read_chunked(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        chunks_written = [b"aaa", b"bbb", b"ccc"]
+        read_data = bytearray()
+
+        def writer():
+            for c in chunks_written:
+                time.sleep(0.02)
+                buf.append(c)
+            buf.close_writer()
+
+        def reader():
+            while True:
+                chunk = buf.read(1024)
+                if not chunk:
+                    break
+                read_data.extend(chunk)
+
+        wt = threading.Thread(target=writer)
+        rt = threading.Thread(target=reader)
+
+        # Act
+        wt.start()
+        rt.start()
+        wt.join(timeout=5)
+        rt.join(timeout=5)
+
+        # Assert
+        assert bytes(read_data) == b"aaabbbccc"
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_seek_set_and_reread(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        buf.append(b"0123456789")
+        buf.close_writer()
+
+        # Act
+        buf.read(5)
+        buf.seek(2, 0)
+        result = buf.read(3)
+
+        # Assert
+        assert result == b"234"
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_seek_end_blocks_until_eof(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        positions = []
+
+        def writer():
+            time.sleep(0.1)
+            buf.append(b"abcdef")
+            buf.close_writer()
+
+        t = threading.Thread(target=writer)
+        t.start()
+
+        # Act
+        pos = buf.seek(0, 2)
+        positions.append(pos)
+        t.join(timeout=5)
+
+        # Assert
+        assert positions[0] == 6
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_tell_tracks_position(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        buf.append(b"data")
+        buf.close_writer()
+
+        # Assert
+        assert buf.tell() == 0
+        buf.read(2)
+        assert buf.tell() == 2
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_file_persisted_to_disk(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        payload = b"x" * 10000
+
+        # Act
+        buf.append(payload)
+        buf.close_writer()
+
+        # Assert
+        with open(buf.path, "rb") as f:
+            assert f.read() == payload
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_written_property(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        buf.append(b"abc")
+        buf.append(b"defgh")
+        buf.close_writer()
+
+        # Assert
+        assert buf.written == 8
+        buf.close()
+        os.unlink(buf.path)
+
+    def test_seekable_readable(self):
+        # Arrange
+        from streaming_buffer import StreamingBuffer
+
+        buf = StreamingBuffer()
+        buf.close_writer()
+
+        # Assert
+        assert buf.seekable() is True
+        assert buf.readable() is True
+        assert buf.writable() is False
+        buf.close()
+        os.unlink(buf.path)
+
+
+class TestMediaContentHashFromFile:
+    def test_small_file_matches_bytes_version(self):
+        # Arrange
+        from media_hash import compute_media_content_hash, compute_media_content_hash_from_file
+
+        data = b"hello world"
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            f.write(data)
+            path = f.name
+
+        # Act
+        hash_bytes = compute_media_content_hash(data)
+        hash_file = compute_media_content_hash_from_file(path)
+
+        # Assert
+        assert hash_file == hash_bytes
+        os.unlink(path)
+
+    def test_large_file_matches_bytes_version(self):
+        # Arrange
+        from media_hash import compute_media_content_hash, compute_media_content_hash_from_file
+
+        data = os.urandom(50_000)
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            f.write(data)
+            path = f.name
+
+        # Act
+        hash_bytes = compute_media_content_hash(data)
+        hash_file = compute_media_content_hash_from_file(path)
+
+        # Assert
+        assert hash_file == hash_bytes
+        os.unlink(path)
+
+    def test_virtual_flag(self):
+        # Arrange
+        from media_hash import compute_media_content_hash_from_file
+
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            f.write(b"test")
+            path = f.name
+
+        # Act
+        normal = compute_media_content_hash_from_file(path, virtual=False)
+        virtual = compute_media_content_hash_from_file(path, virtual=True)
+
+        # Assert
+        assert virtual == f"V{normal}"
+        os.unlink(path)
+
+    def test_exact_boundary_3072_bytes(self):
+        # Arrange
+        from media_hash import compute_media_content_hash, compute_media_content_hash_from_file
+
+        data = os.urandom(3072)
+        with tempfile.NamedTemporaryFile(delete=False) as f:
+            f.write(data)
+            path = f.name
+
+        # Act
+        hash_bytes = compute_media_content_hash(data)
+        hash_file = compute_media_content_hash_from_file(path)
+
+        # Assert
+        assert hash_file == hash_bytes
+        os.unlink(path)
+
+
+def _access_jwt(sub: str = "u1") -> str:
+    raw = json.dumps(
+        {"exp": int(time.time()) + 3600, "sub": sub}, separators=(",", ":")
+    ).encode()
+    payload = base64.urlsafe_b64encode(raw).decode().rstrip("=")
+    return f"h.{payload}.s"
+
+
+class _FakeInfStream:
+    is_engine_ready = True
+
+    def run_detect_video_stream(
+        self, readable, ai_cfg, media_name, on_annotation, status_callback=None
+    ):
+        while True:
+            chunk = readable.read(4096)
+            if not chunk:
+                break
+        if status_callback:
+            status_callback(media_name, 0)
+
+    def run_detect_video(self, *a, **kw):
+        pass
+
+    def run_detect_image(self, *a, **kw):
+        pass
+
+
+class TestDetectVideoEndpoint:
+    @pytest.fixture(autouse=True)
+    def reset_inference(self):
+        import main
+        main.inference = None
+        yield
+        main.inference = None
+
+    def test_streaming_upload_returns_started(self):
+        # Arrange
+        import main
+        from media_hash import compute_media_content_hash
+
+        video_body = b"fake-video-" * 200
+        content_hash = compute_media_content_hash(video_body)
+        mock_post = MagicMock()
+        mock_post.return_value.status_code = 201
+        mock_put = MagicMock()
+        mock_put.return_value.status_code = 204
+        token = _access_jwt()
+
+        with tempfile.TemporaryDirectory() as vd:
+            os.environ["VIDEOS_DIR"] = vd
+            from fastapi.testclient import TestClient
+            client = TestClient(main.app)
+            with (
+                patch.object(main, "get_inference", return_value=_FakeInfStream()),
+                patch.object(main.http_requests, "post", mock_post),
+                patch.object(main.http_requests, "put", mock_put),
+            ):
+                # Act
+                r = client.post(
+                    "/detect/video",
+                    content=video_body,
+                    headers={
+                        "X-Filename": "test.mp4",
+                        "Authorization": f"Bearer {token}",
+                    },
+                )
+            # Assert
+            assert r.status_code == 200
+            data = r.json()
+            assert data["status"] == "started"
+            assert data["mediaId"] == content_hash
+            stored = os.path.join(vd, f"{content_hash}.mp4")
+            assert os.path.isfile(stored)
+            with open(stored, "rb") as f:
+                assert f.read() == video_body
+
+    def test_non_auth_cleanup(self):
+        # Arrange
+        import main
+
+        video_body = b"noauth-vid-" * 100
+        with tempfile.TemporaryDirectory() as vd:
+            os.environ["VIDEOS_DIR"] = vd
+            from fastapi.testclient import TestClient
+            client = TestClient(main.app)
+            with patch.object(main, "get_inference", return_value=_FakeInfStream()):
+                # Act
+                r = client.post(
+                    "/detect/video",
+                    content=video_body,
+                    headers={"X-Filename": "test.mp4"},
+                )
+            # Assert
+            assert r.status_code == 200
+            assert r.json()["status"] == "started"
+
+    def test_rejects_non_video_extension(self):
+        # Arrange
+        import main
+
+        from fastapi.testclient import TestClient
+        client = TestClient(main.app)
+
+        # Act
+        r = client.post(
+            "/detect/video",
+            content=b"data",
+            headers={"X-Filename": "photo.jpg"},
+        )
+
+        # Assert
+        assert r.status_code == 400
+
+    def test_data_flows_through_streaming_buffer(self):
+        # Arrange
+        import main
+        from streaming_buffer import StreamingBuffer
+
+        received_chunks = []
+
+        class _CaptureInf(_FakeInfStream):
+            def run_detect_video_stream(
+                self, readable, ai_cfg, media_name, on_annotation, status_callback=None
+            ):
+                while True:
+                    chunk = readable.read(4096)
+                    if not chunk:
+                        break
+                    received_chunks.append(chunk)
+
+        video_body = b"A" * 10000
+        with tempfile.TemporaryDirectory() as vd:
+            os.environ["VIDEOS_DIR"] = vd
+            from fastapi.testclient import TestClient
+            client = TestClient(main.app)
+            with patch.object(main, "get_inference", return_value=_CaptureInf()):
+                # Act
+                r = client.post(
+                    "/detect/video",
+                    content=video_body,
+                    headers={"X-Filename": "v.mp4"},
+                )
+
+            # Assert
+            assert r.status_code == 200
+            all_received = b"".join(received_chunks)
+            assert all_received == video_body