diff --git a/.cursor/skills/autopilot/flows/existing-code.md b/.cursor/skills/autopilot/flows/existing-code.md index 0e47f87..cbc6a96 100644 --- a/.cursor/skills/autopilot/flows/existing-code.md +++ b/.cursor/skills/autopilot/flows/existing-code.md @@ -217,22 +217,18 @@ After deployment completes, the existing-code workflow is done. **Re-Entry After Completion** Condition: the autopilot state shows `step: done` OR all steps through 13 (Deploy) are completed -Action: The project completed a full cycle. Present status and loop back to New Task: +Action: The project completed a full cycle. Print the status banner and automatically loop back to New Task — do NOT ask the user for confirmation: ``` ══════════════════════════════════════ PROJECT CYCLE COMPLETE ══════════════════════════════════════ The previous cycle finished successfully. - You can now add new functionality. -══════════════════════════════════════ - A) Add new features (start New Task) - B) Done — no more changes needed + Starting new feature cycle… ══════════════════════════════════════ ``` -- If user picks A → set `step: 8`, `status: not_started` in the state file, then auto-chain to Step 8 (New Task). -- If user picks B → report final project status and exit. +Set `step: 8`, `status: not_started` in the state file, then auto-chain to Step 8 (New Task). 
## Auto-Chain Rules diff --git a/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/model.mlmodel b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/model.mlmodel new file mode 100644 index 0000000..0e70ecf Binary files /dev/null and b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/model.mlmodel differ diff --git a/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/weights/weight.bin b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/weights/weight.bin new file mode 100644 index 0000000..63e3f56 Binary files /dev/null and b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/weights/weight.bin differ diff --git a/_docs/00_problem/input_data/azaion.mlpackage/Manifest.json b/_docs/00_problem/input_data/azaion.mlpackage/Manifest.json new file mode 100644 index 0000000..ef7babc --- /dev/null +++ b/_docs/00_problem/input_data/azaion.mlpackage/Manifest.json @@ -0,0 +1,18 @@ +{ + "fileFormatVersion": "1.0.0", + "itemInfoEntries": { + "7DD8829B-A724-4A3E-A14C-492D47CA6638": { + "author": "com.apple.CoreML", + "description": "CoreML Model Specification", + "name": "model.mlmodel", + "path": "com.apple.CoreML/model.mlmodel" + }, + "D5921D3B-1680-4CD9-B1D5-130EE972BBFD": { + "author": "com.apple.CoreML", + "description": "CoreML Model Weights", + "name": "weights", + "path": "com.apple.CoreML/weights" + } + }, + "rootModelIdentifier": "7DD8829B-A724-4A3E-A14C-492D47CA6638" +} diff --git a/_docs/04_deploy/ci_cd_pipeline.md b/_docs/04_deploy/ci_cd_pipeline.md new file mode 100644 index 0000000..53df502 --- /dev/null +++ b/_docs/04_deploy/ci_cd_pipeline.md @@ -0,0 +1,184 @@ +# Azaion AI Training — CI/CD Pipeline + +## Pipeline Overview + +| Stage | Trigger | Quality Gate | +|-------|---------|-------------| +| Lint | Every push | Zero lint errors | +| Test | Every push | All tests pass | +| Security | Every push | Zero critical/high CVEs | +| Build | PR merge to dev | Docker 
build succeeds | +| Push | After build | Images pushed to registry | +| Deploy | Manual trigger | Health checks pass on target server | + +No staging environment — the system runs on a dedicated GPU server. "Staging" is replaced by the test suite running in CI on CPU-only runners (annotation queue tests, unit tests) and manual GPU verification on the target machine. + +## Stage Details + +### Lint + +- `black --check src/` — Python formatting +- `ruff check src/` — Python linting +- Runs on standard CI runner (no GPU) + +### Test + +- Framework: `pytest` +- Command: `pytest tests/ -v --tb=short` +- Test compose for annotation queue integration tests: `docker compose -f docker-compose.test.yml up --abort-on-container-exit` +- GPU-dependent tests (training, export) are excluded from CI — they require a physical GPU and run during manual verification on the target server +- Coverage report published as pipeline artifact + +### Security + +- Dependency audit: `pip-audit -r requirements.txt` +- Dependency audit: `pip-audit -r src/annotation-queue/requirements.txt` +- SAST scan: Semgrep with `p/python` ruleset +- Image scan: Trivy on built Docker images +- Block on: critical or high severity findings + +### Build + +- Docker images built for both components: + - `docker/training.Dockerfile` → `azaion/training:<git-sha>` + - `docker/annotation-queue.Dockerfile` → `azaion/annotation-queue:<git-sha>` +- Build cache: Docker layer cache via GitHub Actions cache +- Build runs on standard runner — no GPU needed for `docker build` + +### Push + +- Registry: configurable via `DOCKER_REGISTRY` secret (e.g., GitHub Container Registry `ghcr.io`, or private registry) +- Authentication: registry login via CI secrets (`DOCKER_REGISTRY_USER`, `DOCKER_REGISTRY_TOKEN`) + +### Deploy + +- **Manual trigger only** (workflow_dispatch) — training runs for days, unattended deploys are risky +- Deployment method: SSH to target GPU server, run deploy scripts (`scripts/deploy.sh`) +- Pre-deploy: pull new images, 
+ stop services gracefully +- Post-deploy: start services, run health check script +- Rollback: `scripts/deploy.sh --rollback` redeploys previous image tags + +## Pipeline Configuration (GitHub Actions) + +```yaml +name: CI/CD + +on: + push: + branches: [dev, main] + pull_request: + branches: [dev] + workflow_dispatch: + inputs: + deploy_target: + description: "Deploy to target server" + required: true + type: boolean + default: false + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - run: pip install black ruff + - run: black --check src/ + - run: ruff check src/ + + test: + runs-on: ubuntu-latest + needs: lint + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - run: pip install -r requirements-test.txt + - run: pytest tests/ -v --tb=short --ignore=tests/gpu + + security: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - run: pip install pip-audit + - run: pip-audit -r requirements.txt + - run: pip-audit -r src/annotation-queue/requirements.txt + - uses: returntocorp/semgrep-action@v1 + with: + config: p/python + + build-and-push: + runs-on: ubuntu-latest + needs: [test, security] + if: github.ref == 'refs/heads/dev' && github.event_name == 'push' + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + registry: ${{ secrets.DOCKER_REGISTRY }} + username: ${{ secrets.DOCKER_REGISTRY_USER }} + password: ${{ secrets.DOCKER_REGISTRY_TOKEN }} + - uses: docker/build-push-action@v6 + with: + context: . + file: docker/training.Dockerfile + push: true + tags: ${{ secrets.DOCKER_REGISTRY }}/azaion/training:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + - uses: docker/build-push-action@v6 + with: + context: . 
+ file: docker/annotation-queue.Dockerfile + push: true + tags: ${{ secrets.DOCKER_REGISTRY }}/azaion/annotation-queue:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + + deploy: + runs-on: ubuntu-latest + needs: build-and-push + if: ${{ !cancelled() && github.event.inputs.deploy_target == 'true' }} + environment: production + steps: + - uses: actions/checkout@v4 + - run: | + ssh ${{ secrets.DEPLOY_USER }}@${{ secrets.DEPLOY_HOST }} \ + "cd /opt/azaion-training && \ + DOCKER_IMAGE_TAG=${{ github.sha }} \ + bash scripts/deploy.sh" +``` + +## Caching Strategy + +| Cache | Key | Restore Keys | +|-------|-----|-------------| +| pip dependencies | `requirements.txt` hash | `pip-` prefix | +| Docker layers | GitHub Actions cache (BuildKit) | `gha-` prefix | + +## Parallelization + +``` +push event + ├── lint ──► test ──┐ + │ ├──► build-and-push ──► deploy (manual) + └── security ───────┘ +``` + +Lint and security run in parallel. Test depends on lint. Build depends on both test and security passing. 
+ +## Notifications + +| Event | Channel | Recipients | +|-------|---------|-----------| +| Build failure | GitHub PR check | PR author | +| Security alert | GitHub security tab | Repository maintainers | +| Deploy success | GitHub Actions log | Deployment team | +| Deploy failure | GitHub Actions log + email | Deployment team | diff --git a/_docs/04_deploy/containerization.md b/_docs/04_deploy/containerization.md new file mode 100644 index 0000000..5efd335 --- /dev/null +++ b/_docs/04_deploy/containerization.md @@ -0,0 +1,196 @@ +# Azaion AI Training — Containerization + +## Component Dockerfiles + +### Training Pipeline + +| Property | Value | +|----------|-------| +| Base image | `nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04` | +| Build image | Same (devel required for TensorRT engine build + pycuda) | +| Stages | 1) system deps + Python 3.10 → 2) pip install requirements → 3) copy source | +| User | `azaion` (non-root, UID 1000) | +| Health check | Not applicable — batch job, exits on completion | +| Exposed ports | None | +| Key build args | `CUDA_VERSION=12.1.1` | + +Single-stage build (devel image required at runtime for TensorRT engine compilation and pycuda). The image is large but training runs for days on a dedicated GPU server — image size is not a deployment bottleneck. + +Installs from `requirements.txt` with `--extra-index-url https://download.pytorch.org/whl/cu121` for PyTorch CUDA 12.1 wheels. + +Volume mount: `/azaion/` host directory for datasets, models, and annotation data. 
+ +### Annotation Queue + +| Property | Value | +|----------|-------| +| Base image | `python:3.10-slim` | +| Build image | `python:3.10-slim` (no compilation needed) | +| Stages | 1) pip install from `src/annotation-queue/requirements.txt` → 2) copy source | +| User | `azaion` (non-root, UID 1000) | +| Health check | `CMD python -c "import rstream" \|\| exit 1` (process liveness; no HTTP endpoint) | +| Exposed ports | None | +| Key build args | None | + +Lightweight container — only needs `pyyaml`, `msgpack`, `rstream`. No GPU, no heavy ML libraries. Runs as a persistent async process consuming from RabbitMQ Streams. + +Volume mount: `/azaion/` host directory for writing annotation images and labels. + +### Not Containerized + +The following are developer/verification tools, not production services: + +- **Inference Engine** (`start_inference.py`) — used for testing and model verification, runs ad-hoc on a GPU machine +- **Data Tools** (`convert-annotations.py`, `dataset-visualiser.py`) — interactive developer utilities requiring GUI environment + +## Docker Compose — Local Development + +```yaml +services: + rabbitmq: + image: rabbitmq:3.13-management-alpine + ports: + - "5552:5552" + - "5672:5672" + - "15672:15672" + environment: + RABBITMQ_DEFAULT_USER: ${RABBITMQ_USER} + RABBITMQ_DEFAULT_PASS: ${RABBITMQ_PASSWORD} + volumes: + - rabbitmq_data:/var/lib/rabbitmq + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "check_running"] + interval: 10s + timeout: 5s + retries: 5 + + annotation-queue: + build: + context: . + dockerfile: docker/annotation-queue.Dockerfile + env_file: .env + depends_on: + rabbitmq: + condition: service_healthy + volumes: + - ${AZAION_ROOT_DIR:-/azaion}:/azaion + restart: unless-stopped + + training: + build: + context: . 
+ dockerfile: docker/training.Dockerfile + env_file: .env + volumes: + - ${AZAION_ROOT_DIR:-/azaion}:/azaion + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + shm_size: "16g" + +volumes: + rabbitmq_data: + +networks: + default: + name: azaion-training +``` + +Notes: +- `ipc: host` and `shm_size: "16g"` for PyTorch multi-worker data loading +- `annotation-queue` runs continuously, restarts on failure +- RabbitMQ Streams plugin must be enabled (port 5552); the management UI is on port 15672 + +## Docker Compose — Blackbox Tests + +```yaml +services: + rabbitmq: + image: rabbitmq:3.13-management-alpine + ports: + - "5552:5552" + - "5672:5672" + environment: + RABBITMQ_DEFAULT_USER: test_user + RABBITMQ_DEFAULT_PASS: test_pass + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "check_running"] + interval: 5s + timeout: 3s + retries: 10 + + annotation-queue: + build: + context: . + dockerfile: docker/annotation-queue.Dockerfile + environment: + RABBITMQ_HOST: rabbitmq + RABBITMQ_PORT: "5552" + RABBITMQ_USER: test_user + RABBITMQ_PASSWORD: test_pass + RABBITMQ_QUEUE_NAME: azaion-annotations + AZAION_ROOT_DIR: /azaion + depends_on: + rabbitmq: + condition: service_healthy + volumes: + - test_data:/azaion + + test-runner: + build: + context: . + dockerfile: docker/test-runner.Dockerfile + environment: + RABBITMQ_HOST: rabbitmq + RABBITMQ_PORT: "5552" + RABBITMQ_USER: test_user + RABBITMQ_PASSWORD: test_pass + AZAION_ROOT_DIR: /azaion + TEST_SCOPE: blackbox + depends_on: + rabbitmq: + condition: service_healthy + annotation-queue: + condition: service_started + volumes: + - test_data:/azaion + +volumes: + test_data: +``` + +Run: `docker compose -f docker-compose.test.yml up --abort-on-container-exit` + +Note: GPU-dependent tests (training) require `--gpus all` and are excluded from the default blackbox test suite. 
They run separately via `docker compose -f docker-compose.test.yml --profile gpu-tests up --abort-on-container-exit`. + +## Image Tagging Strategy + +| Context | Tag Format | Example | +|---------|-----------|---------| +| CI build | `<registry>/azaion/<component>:<git-sha>` | `registry.example.com/azaion/training:a1b2c3d` | +| Release | `<registry>/azaion/<component>:<semver>` | `registry.example.com/azaion/training:1.0.0` | +| Local dev | `azaion-<component>:latest` | `azaion-training:latest` | + +## .dockerignore + +``` +.git +.cursor +_docs +_standalone +tests +**/__pycache__ +**/*.pyc +*.md +.env +.env.example +docker-compose*.yml +.gitignore +.editorconfig +requirements-test.txt +``` diff --git a/_docs/04_deploy/deploy_scripts.md b/_docs/04_deploy/deploy_scripts.md new file mode 100644 index 0000000..b15b002 --- /dev/null +++ b/_docs/04_deploy/deploy_scripts.md @@ -0,0 +1,115 @@ +# Azaion AI Training — Deployment Scripts + +## Overview + +| Script | Purpose | Location | +|--------|---------|----------| +| `deploy.sh` | Main deployment orchestrator | `scripts/deploy.sh` | +| `generate-config.sh` | Generate `config.yaml` from environment variables | `scripts/generate-config.sh` | +| `pull-images.sh` | Pull Docker images from registry | `scripts/pull-images.sh` | +| `start-services.sh` | Start all services via Docker Compose | `scripts/start-services.sh` | +| `stop-services.sh` | Graceful shutdown with tag backup | `scripts/stop-services.sh` | +| `health-check.sh` | Verify deployment health | `scripts/health-check.sh` | + +## Prerequisites + +- Docker and Docker Compose installed on target machine +- NVIDIA driver + Docker GPU support (`nvidia-container-toolkit`) +- SSH access to target machine (for remote deployment) +- `.env` file with required environment variables (see `.env.example`) + +## Environment Variables + +All scripts source `.env` from the project root. 
+ +| Variable | Required By | Purpose | +|----------|------------|---------| +| `DEPLOY_HOST` | `deploy.sh` (remote) | SSH target for remote deployment | +| `DEPLOY_USER` | `deploy.sh` (remote) | SSH user (default: `deploy`) | +| `DOCKER_REGISTRY` | `pull-images.sh` | Container registry URL | +| `DOCKER_IMAGE_TAG` | `pull-images.sh` | Image version to deploy (default: `latest`) | +| `AZAION_API_URL` | `generate-config.sh` | Azaion REST API URL | +| `AZAION_API_EMAIL` | `generate-config.sh` | API login email | +| `AZAION_API_PASSWORD` | `generate-config.sh` | API login password | +| `RABBITMQ_HOST` | `generate-config.sh` | RabbitMQ host | +| `RABBITMQ_PORT` | `generate-config.sh` | RabbitMQ port | +| `RABBITMQ_USER` | `generate-config.sh` | RabbitMQ username | +| `RABBITMQ_PASSWORD` | `generate-config.sh` | RabbitMQ password | +| `RABBITMQ_QUEUE_NAME` | `generate-config.sh` | RabbitMQ queue name | +| `AZAION_ROOT_DIR` | `start-services.sh`, `health-check.sh` | Root data directory (default: `/azaion`) | + +## Script Details + +### deploy.sh + +Main orchestrator: generates config, pulls images, stops old services, starts new ones, checks health. + +``` +./scripts/deploy.sh # Deploy latest version (local) +./scripts/deploy.sh --rollback # Rollback to previous version +./scripts/deploy.sh --local # Force local mode (skip SSH) +./scripts/deploy.sh --help # Show usage +``` + +Flow: `generate-config.sh` → `pull-images.sh` → `stop-services.sh` → `start-services.sh` → `health-check.sh` + +When `DEPLOY_HOST` is set, commands execute over SSH on the remote server. Without it, runs locally. + +### generate-config.sh + +Generates `config.yaml` from environment variables, preserving the existing config format the codebase expects. Validates that all required variables are set before writing. + +``` +./scripts/generate-config.sh # Generate config.yaml +``` + +### pull-images.sh + +Pulls Docker images for both deployable components from the configured registry. 
+ +``` +./scripts/pull-images.sh # Pull images +``` + +Images pulled: +- `${DOCKER_REGISTRY}/azaion/training:${DOCKER_IMAGE_TAG}` +- `${DOCKER_REGISTRY}/azaion/annotation-queue:${DOCKER_IMAGE_TAG}` + +### start-services.sh + +Creates the `/azaion/` directory tree if needed, then runs `docker compose up -d`. + +``` +./scripts/start-services.sh # Start services +``` + +### stop-services.sh + +Saves current image tags to `scripts/.previous-tags` for rollback, then stops and removes containers with a 30-second grace period. + +``` +./scripts/stop-services.sh # Stop services +``` + +### health-check.sh + +Checks container status, GPU availability, disk usage, and queue offset. Returns exit code 0 (healthy) or 1 (unhealthy). + +``` +./scripts/health-check.sh # Run health check +``` + +Checks performed: +- Annotation queue and RabbitMQ containers running +- GPU available and temperature < 90°C +- Disk usage < 95% (warning at 80%) +- Queue offset file exists + +## Common Properties + +All scripts: +- Use `#!/bin/bash` with `set -euo pipefail` +- Support `--help` flag +- Source `.env` from project root if present +- Are idempotent +- Support remote execution via SSH (`DEPLOY_HOST` + `DEPLOY_USER`) diff --git a/_docs/04_deploy/deployment_procedures.md b/_docs/04_deploy/deployment_procedures.md new file mode 100644 index 0000000..e350772 --- /dev/null +++ b/_docs/04_deploy/deployment_procedures.md @@ -0,0 +1,115 @@ +# Azaion AI Training — Deployment Procedures + +## Deployment Strategy + +**Pattern**: Stop-and-replace on a single GPU server +**Rationale**: The system runs on one dedicated GPU server. Training takes days — there is no "zero-downtime" concern for the training process. The annotation queue can tolerate brief restarts (queue offset is persisted, messages are replayed from last offset). 
+ +### Component Behavior During Deploy + +| Component | Deploy Impact | Recovery | +|-----------|--------------|----------| +| Training Pipeline | Must finish current run or be stopped manually. Never interrupted mid-training — checkpoints save every epoch. | Resume from last checkpoint (`resume_training`) | +| Annotation Queue | Brief restart (< 30 seconds). Messages accumulate in RabbitMQ during downtime. | Resumes from persisted offset in `offset.yaml` | + +### Graceful Shutdown + +- **Training**: not stopped by deploy scripts — training runs for days and is managed independently. Deploy only updates images/code for the *next* training run. +- **Annotation Queue**: `docker stop` with 30-second grace period → SIGTERM → process exits → container replaced with new image. + +## Health Checks + +No HTTP endpoints — these are batch processes and queue consumers. Health is verified by: + +| Check | Method | Target | Interval | Failure Action | +|-------|--------|--------|----------|----------------| +| Annotation Queue alive | `docker inspect --format='{{.State.Running}}'` | annotation-queue container | 5 min (cron) | Restart container | +| RabbitMQ reachable | TCP connect to `$RABBITMQ_HOST:$RABBITMQ_PORT` | RabbitMQ server | 5 min (cron) | Alert, check network | +| GPU available | `nvidia-smi` exit code | NVIDIA driver | 5 min (cron) | Alert, check driver | +| Disk space | `df /azaion/ --output=pcent` | Filesystem | 5 min (cron) | Alert if > 80%, critical if > 95% | +| Queue offset advancing | Compare `offset.yaml` value to previous check | Annotation queue progress | 30 min | Alert if stale and queue has messages | + +All checks are performed by `scripts/health-check.sh`. 
+ +## Production Deployment + +### Pre-Deploy Checklist + +- [ ] All CI tests pass on `dev` branch +- [ ] Security scan clean (zero critical/high CVEs) +- [ ] Docker images built and pushed to registry +- [ ] `.env` on target server is up to date with any new variables +- [ ] `/azaion/` directory tree exists with correct permissions +- [ ] No training run is currently active (or training will not be restarted this deploy) +- [ ] NVIDIA driver and Docker with GPU support are installed on target + +### Deploy Steps + +1. SSH to GPU server +2. Pull new Docker images: `scripts/pull-images.sh` +3. Stop annotation queue: `scripts/stop-services.sh` +4. Generate `config.yaml` from `.env` template +5. Start services: `scripts/start-services.sh` +6. Verify health: `scripts/health-check.sh` +7. Confirm annotation queue is consuming messages (check offset advancing) + +All steps are orchestrated by `scripts/deploy.sh`. + +### Post-Deploy Verification + +- Check `docker ps` — annotation-queue container is running +- Check `docker logs annotation-queue --tail 20` — no errors +- Check `offset.yaml` — offset is advancing (queue is consuming) +- Check disk space — adequate for continued operation + +## Rollback Procedures + +### Trigger Criteria + +- Annotation queue crashes repeatedly after deploy (restart loop) +- Queue messages are being dropped or corrupted +- `config.yaml` generation failed (missing env vars) +- New code has a bug affecting annotation processing + +### Rollback Steps + +1. Run `scripts/deploy.sh --rollback` + - This reads the previous image tags from `scripts/.previous-tags` (saved during deploy) + - Stops current containers + - Starts containers with previous image tags +2. Verify health: `scripts/health-check.sh` +3. Check annotation queue is consuming correctly +4. Investigate root cause of the failed deploy + +### Training Rollback + +Training is not managed by deploy scripts. If a new training run produces bad results: +1. 
The previous `best.pt` model is still available in `/azaion/models/` (dated directories) +2. Roll back by pointing `config.yaml` to the previous model +3. No container restart needed — training is a batch job started manually + +## Deployment Checklist (Quick Reference) + +``` +Pre-deploy: + □ CI green on dev branch + □ Images built and pushed + □ .env updated on server (if new vars added) + □ No active training run (if training container is being updated) + +Deploy: + □ SSH to server + □ Run scripts/deploy.sh + □ Verify health-check.sh passes + +Post-deploy: + □ docker ps shows containers running + □ docker logs show no errors + □ Queue offset advancing + □ Disk space adequate + +If problems: + □ Run scripts/deploy.sh --rollback + □ Verify health + □ Investigate logs +``` diff --git a/_docs/04_deploy/environment_strategy.md b/_docs/04_deploy/environment_strategy.md new file mode 100644 index 0000000..3ead8e7 --- /dev/null +++ b/_docs/04_deploy/environment_strategy.md @@ -0,0 +1,106 @@ +# Azaion AI Training — Environment Strategy + +## Environments + +| Environment | Purpose | Infrastructure | Data Source | +|-------------|---------|---------------|-------------| +| Development | Local developer workflow | docker-compose, local RabbitMQ | Test annotations, small sample dataset | +| Production | Live training on GPU server | Direct host processes or Docker, real RabbitMQ | Real annotations from Azaion platform | + +No staging environment — the system is an ML training pipeline on a dedicated GPU server, not a multi-tier web service. Validation happens through the CI test suite (CPU tests) and manual verification on the GPU server before committing to a long training run. 
+ +## Environment Variables + +### Required Variables + +| Variable | Purpose | Dev Default | Prod Source | +|----------|---------|-------------|-------------| +| `AZAION_API_URL` | Azaion REST API base URL | `https://api.azaion.com` | `.env` on server | +| `AZAION_API_EMAIL` | API login email | dev account | `.env` on server | +| `AZAION_API_PASSWORD` | API login password | dev password | `.env` on server | +| `RABBITMQ_HOST` | RabbitMQ host | `127.0.0.1` (local container) | `.env` on server | +| `RABBITMQ_PORT` | RabbitMQ Streams port | `5552` | `.env` on server | +| `RABBITMQ_USER` | RabbitMQ username | `azaion_receiver` | `.env` on server | +| `RABBITMQ_PASSWORD` | RabbitMQ password | `changeme` | `.env` on server | +| `RABBITMQ_QUEUE_NAME` | Queue name | `azaion-annotations` | `.env` on server | +| `AZAION_ROOT_DIR` | Root data directory | `/azaion` | `.env` on server | +| `AZAION_DATA_DIR` | Validated annotations dir name | `data` | `.env` on server | +| `AZAION_DATA_SEED_DIR` | Unvalidated annotations dir name | `data-seed` | `.env` on server | +| `AZAION_DATA_DELETED_DIR` | Deleted annotations dir name | `data_deleted` | `.env` on server | +| `TRAINING_MODEL` | Base model filename | `yolo26m.pt` | `.env` on server | +| `TRAINING_EPOCHS` | Training epochs | `120` | `.env` on server | +| `TRAINING_BATCH_SIZE` | Training batch size | `11` | `.env` on server | +| `TRAINING_IMGSZ` | Training image size | `1280` | `.env` on server | +| `TRAINING_SAVE_PERIOD` | Checkpoint save interval | `1` | `.env` on server | +| `TRAINING_WORKERS` | Dataloader workers | `24` | `.env` on server | +| `EXPORT_ONNX_IMGSZ` | ONNX export image size | `1280` | `.env` on server | + +### `.env.example` + +Committed to version control with placeholder values. See `.env.example` in project root (created in Step 1). 
+ +### Variable Validation + +The `config.yaml` generation script (part of deploy scripts) validates that all required environment variables are set before writing the config file. Missing variables cause an immediate failure with a clear error listing which variables are absent. + +## Config Generation + +The codebase reads configuration from `config.yaml`, not directly from environment variables. The deployment flow generates `config.yaml` from environment variables at deploy time: + +1. `.env` contains all variable values (never committed) +2. Deploy script sources `.env` and renders `config.yaml` from a template +3. `config.yaml` is placed at the expected location for the application + +This preserves the existing code's config reading pattern while externalizing secrets to environment variables. + +## Secrets Management + +| Environment | Method | Location | +|-------------|--------|----------| +| Development | `.env` file (git-ignored) | Project root | +| Production | `.env` file (restricted permissions) | GPU server `/opt/azaion-training/.env` | + +Production `.env` file: +- Ownership: `root:deploy` (deploy user's group) +- Permissions: `640` (owner read/write, group read, others none) +- Located outside the Docker build context + +Secrets in this project: +- `AZAION_API_PASSWORD` — API authentication +- `RABBITMQ_PASSWORD` — message queue access +- CDN credentials — auto-provisioned via API at runtime (encrypted `cdn.yaml`), not in `.env` +- Model encryption key — hardcoded in `security.py` (existing pattern, flagged as security concern) + +Rotation policy: rotate API and RabbitMQ passwords quarterly. Update `.env` on the server, restart affected services. 
+ +## Filesystem Management + +| Environment | `/azaion/` Location | Contents | +|-------------|-------------------|----------| +| Development | Docker volume or local dir | Test images, small sample labels | +| Production | Host directory `/azaion/` | Full annotation dataset, trained models, export artifacts | + +The `/azaion/` directory tree must exist before services start: + +``` +/azaion/ +├── data/ (validated annotations: images/ + labels/) +├── data-seed/ (unvalidated annotations: images/ + labels/) +├── data_deleted/ (soft-deleted annotations: images/ + labels/) +├── datasets/ (formed training datasets: azaion-YYYY-MM-DD/) +├── models/ (trained models: azaion-YYYY-MM-DD/, azaion.pt) +└── classes.json (annotation class definitions) +``` + +Production data is persistent and never deleted by deployment. Docker containers mount this directory as a bind mount. + +## External Service Configuration + +| Service | Dev | Prod | +|---------|-----|------| +| Azaion REST API | Real API (dev credentials) | Real API (prod credentials) | +| S3-compatible CDN | Auto-provisioned via API | Auto-provisioned via API | +| RabbitMQ | Local container (docker-compose) | Managed instance on network | +| NVIDIA GPU | Host GPU via `--gpus all` | Host GPU via `--gpus all` | + +CDN credentials are not in `.env` — they are fetched from the API at runtime as an encrypted `cdn.yaml` file, decrypted using the hardware-bound key. This is the existing pattern and does not need environment variable configuration. diff --git a/_docs/04_deploy/observability.md b/_docs/04_deploy/observability.md new file mode 100644 index 0000000..b069367 --- /dev/null +++ b/_docs/04_deploy/observability.md @@ -0,0 +1,135 @@ +# Azaion AI Training — Observability + +This system is an ML training pipeline, not a web service. Observability focuses on training progress, GPU health, queue throughput, and disk usage rather than HTTP request metrics. + +## Logging + +### Format + +Structured JSON to stdout/stderr. 
Containers should not write log files — use Docker's log driver for collection. + +```json +{ + "timestamp": "2026-03-28T14:30:00Z", + "level": "INFO", + "service": "training", + "message": "Epoch 45/120 completed", + "context": {"epoch": 45, "loss": 0.0234, "mAP50": 0.891} +} +``` + +### Log Levels + +| Level | Usage | Example | +|-------|-------|---------| +| ERROR | Exceptions, unrecoverable failures | GPU out of memory, API auth failed, corrupt label file | +| WARN | Recoverable issues | Queue reconnection attempt, skipped corrupt image | +| INFO | Progress and business events | Epoch completed, dataset formed, model exported, annotation saved | +| DEBUG | Diagnostics (dev only) | Individual file processing, queue message contents | + +### Current State + +| Component | Current Logging | Target | +|-----------|----------------|--------| +| Training Pipeline | `print()` statements | Python `logging` with JSON formatter to stdout | +| Annotation Queue | `logging` with TimedRotatingFileHandler | Keep existing + add JSON stdout for Docker | +| Inference Engine | `print()` statements | Not in deployment scope | + +### Retention + +| Environment | Destination | Retention | +|-------------|-------------|-----------| +| Development | Console (docker logs) | Session | +| Production | Docker JSON log driver → host filesystem | 30 days (log rotation via Docker daemon config) | + +### PII Rules + +- Never log API passwords or tokens +- Never log CDN credentials +- Never log model encryption keys +- Queue message image data (base64 bytes) must not be logged at INFO level + +## Metrics + +### Collection Method + +No HTTP `/metrics` endpoint — these are batch processes, not services. Metrics are collected via: +1. **Docker stats** — CPU, memory, GPU via `nvidia-smi` +2. **Training logs** — parsed from structured log output (epoch, loss, mAP) +3. 
**Filesystem monitoring** — disk usage of `/azaion/` directory tree + +### Key Metrics + +| Metric | Type | Source | Description | +|--------|------|--------|-------------| +| `training_epoch` | Gauge | Training logs | Current epoch number | +| `training_loss` | Gauge | Training logs | Current training loss | +| `training_mAP50` | Gauge | Training logs | Mean average precision at IoU 0.50 | +| `training_mAP50_95` | Gauge | Training logs | mAP at IoU 0.50:0.95 | +| `gpu_utilization_pct` | Gauge | `nvidia-smi` | GPU compute utilization | +| `gpu_memory_used_mb` | Gauge | `nvidia-smi` | GPU memory usage | +| `gpu_temperature_c` | Gauge | `nvidia-smi` | GPU temperature | +| `disk_usage_azaion_gb` | Gauge | `df` / `du` | Total disk usage of `/azaion/` | +| `disk_usage_datasets_gb` | Gauge | `du` | Disk usage of `/azaion/datasets/` | +| `disk_usage_models_gb` | Gauge | `du` | Disk usage of `/azaion/models/` | +| `queue_messages_processed` | Counter | Queue logs | Total annotations processed | +| `queue_messages_failed` | Counter | Queue logs | Failed message processing | +| `queue_offset` | Gauge | `offset.yaml` | Last processed queue offset | + +### Monitoring Script + +A `scripts/health-check.sh` script (created in Step 7) collects these metrics on demand: +- Checks Docker container status +- Reads `nvidia-smi` for GPU metrics +- Checks disk usage +- Reads annotation queue offset +- Reports overall system health + +Collection interval: on-demand via health check script, or via cron job (every 5 minutes) for continuous monitoring. + +## Distributed Tracing + +Not applicable. The system consists of independent batch processes (training, annotation queue) that do not form request chains. No distributed tracing is needed. 
+ +## Alerting + +| Severity | Condition | Response Time | Action | +|----------|-----------|---------------|--------| +| Critical | GPU temperature > 90°C | Immediate | Pause training, investigate cooling | +| Critical | Annotation queue process crashed | 5 min | Restart container, check logs | +| Critical | Disk usage > 95% | 5 min | Free space (old datasets/models), expand storage | +| High | Training loss NaN or diverging | 30 min | Check dataset, review hyperparameters | +| High | GPU memory OOM | 30 min | Reduce batch size, restart training | +| Medium | Disk usage > 80% | 4 hours | Plan cleanup of old datasets | +| Medium | Queue offset stale for > 1 hour | 4 hours | Check RabbitMQ connectivity | +| Low | Training checkpoint save failed | Next business day | Check disk space, retry | + +### Notification Method + +For a single GPU server deployment, alerts are practical via: +- **Cron-based health check** running `scripts/health-check.sh` every 5 minutes +- Critical/High alerts: write to a status file, optionally send email or webhook notification +- Dashboard: a simple status page generated from the last health check output + +## Dashboards + +### Operations View + +For a single-server deployment, a lightweight monitoring approach: + +1. **GPU dashboard**: `nvidia-smi dmon` or `nvitop` running in a tmux session +2. **Training progress**: tail structured logs for epoch/loss/mAP progression +3. **Disk usage**: periodic `du -sh /azaion/*/` output +4. **Container status**: `docker ps` + `docker stats` + +### Training Progress View + +Key information to track during a training run: +- Current epoch / total epochs +- Training loss trend (decreasing = good) +- Validation mAP50 and mAP50-95 (increasing = good) +- GPU utilization and temperature +- Estimated time remaining +- Last checkpoint saved + +YOLO's built-in TensorBoard integration provides this out of the box. Access via `tensorboard --logdir /azaion/models/azaion-YYYY-MM-DD/` on the training server. 
diff --git a/_docs/04_deploy/reports/deploy_status_report.md b/_docs/04_deploy/reports/deploy_status_report.md index c46a460..5298465 100644 --- a/_docs/04_deploy/reports/deploy_status_report.md +++ b/_docs/04_deploy/reports/deploy_status_report.md @@ -8,8 +8,10 @@ |-----------|--------|-------------|---------|-------------------| | Training Pipeline | Implemented & Tested | `train.py` | Long-running (days) | GPU server, RTX 4090 (24GB VRAM) | | Annotation Queue | Implemented & Tested | `annotation-queue/annotation_queue_handler.py` | Continuous (async) | Any server with network access | -| Inference Engine | Implemented & Tested | `start_inference.py` | On-demand | GPU-equipped machine | -| Data Tools | Implemented | `convert-annotations.py`, `dataset-visualiser.py` | Ad-hoc | Developer machine | + +Not deployed as production services: +- **Inference Engine** (`start_inference.py`) — verification/testing tool, runs ad-hoc on GPU machine +- **Data Tools** (`convert-annotations.py`, `dataset-visualiser.py`) — developer utilities Note: Augmentation is not a separate process — it is YOLO's built-in mosaic/mixup within the training pipeline. diff --git a/_docs/_autopilot_state.md b/_docs/_autopilot_state.md index ec07404..90ad604 100644 --- a/_docs/_autopilot_state.md +++ b/_docs/_autopilot_state.md @@ -2,8 +2,8 @@ ## Current Step flow: existing-code -step: 13 +step: done name: Deploy -status: in_progress -sub_step: 1 — Status & Env Check +status: completed +sub_step: 7 — Deployment Scripts retry_count: 0 diff --git a/scripts/deploy.sh b/scripts/deploy.sh new file mode 100755 index 0000000..73d4523 --- /dev/null +++ b/scripts/deploy.sh @@ -0,0 +1,105 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +usage() { + cat < "$CONFIG_FILE" </dev/null || true) + if [[ -z "$cid" ]]; then + check "$svc" "container not found" + else + state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null || echo "unknown") + if [[ "$state" == "running" ]]; then + check "$svc" "OK" + else + check "$svc" "state=$state" + fi + fi +done +echo "" + +echo "GPU:" +if command -v nvidia-smi &>/dev/null; then + gpu_temp=$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") + gpu_mem=$(nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") + gpu_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") + + if [[ "$gpu_temp" != "N/A" ]]; then + check "GPU available" "OK" + printf " %-30s %s°C\n" " Temperature" "$gpu_temp" + printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem" + printf " %-30s %s%%\n" " Utilization" "$gpu_util" + if [[ "$gpu_temp" -gt 90 ]]; then + check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C" + fi + else + check "GPU" "nvidia-smi failed" + fi +else + check "GPU (nvidia-smi)" "not installed" +fi +echo "" + +echo "Disk:" +if [[ -d "$AZAION_ROOT_DIR" ]]; then + disk_pct=$(df "$AZAION_ROOT_DIR" --output=pcent 2>/dev/null | tail -1 | tr -d ' %' || echo "N/A") + if [[ "$disk_pct" != "N/A" ]]; then + if [[ "$disk_pct" -gt 95 ]]; then + check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%" + elif [[ "$disk_pct" -gt 80 ]]; then + check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%" + else + check "Disk usage ($AZAION_ROOT_DIR)" "OK" + fi + printf " %-30s %s%%\n" " Usage" "$disk_pct" + fi + azaion_size=$(du -sh "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1 || echo "N/A") + printf " %-30s %s\n" " Total size" "$azaion_size" +else + check "Data directory ($AZAION_ROOT_DIR)" "does not exist" +fi +echo "" + +echo "Queue:" 
+OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml" +if [[ -f "$OFFSET_FILE" ]]; then + offset=$(grep 'offset_queue' "$OFFSET_FILE" 2>/dev/null | awk '{print $2}' || echo "N/A") + printf " %-30s %s\n" "Last queue offset" "$offset" + check "Offset file" "OK" +else + check "Offset file" "not found at $OFFSET_FILE" +fi +echo "" + +echo "=== Result: $(if $HEALTHY; then echo 'HEALTHY'; else echo 'UNHEALTHY'; fi) ===" + +if $HEALTHY; then + exit 0 +else + exit 1 +fi diff --git a/scripts/pull-images.sh b/scripts/pull-images.sh new file mode 100755 index 0000000..aeced88 --- /dev/null +++ b/scripts/pull-images.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +usage() { + cat </dev/null || true) + if [[ -n "$cid" ]]; then + img=$(docker inspect --format='{{.Config.Image}}' "$cid" 2>/dev/null || echo "unknown") + echo "PREV_IMAGE_${svc//-/_}=$img" + fi + done +} > "$PREV_TAGS" + +echo "Stopping services (30s grace period)..." +docker compose -f "$PROJECT_ROOT/docker-compose.yml" stop -t 30 + +echo "Removing containers..." +docker compose -f "$PROJECT_ROOT/docker-compose.yml" down --remove-orphans + +echo "Services stopped. 
Previous tags saved to $PREV_TAGS" diff --git a/tests/conftest.py b/tests/conftest.py index d4a2dff..d717976 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ _TEST_ROOT = _TESTS_DIR / "root" _DATASET_IMAGES = _TEST_ROOT / "data" / "images" _DATASET_LABELS = _TEST_ROOT / "data" / "labels" _ONNX_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion.onnx" +_PT_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion-2025-03-10.pt" _CLASSES_JSON = _PROJECT_ROOT / "src" / "classes.json" _CONFIG_TEST = _PROJECT_ROOT / "config.test.yaml" _MODELS_DIR = _TEST_ROOT / "models" @@ -88,6 +89,14 @@ def fixture_onnx_model(): return p.read_bytes() +@pytest.fixture(scope="session") +def fixture_pt_model(): + p = _PT_MODEL + if not p.is_file(): + pytest.skip(f"missing pt model: {p}") + return str(p) + + @pytest.fixture(scope="session") def fixture_classes_json(): p = _CLASSES_JSON diff --git a/tests/test_export.py b/tests/test_export.py index a36a271..631c4df 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -1,3 +1,4 @@ +import shutil import sys from pathlib import Path @@ -180,3 +181,27 @@ class TestCoremlExport: # Assert assert len(results) == 1 assert results[0].boxes is not None + + +_INPUT_DATA = _TESTS_DIR.parent / "_docs" / "00_problem" / "input_data" + + +@pytest.mark.skipif(sys.platform != "darwin", reason="CoreML requires macOS") +class TestCoremlExportRealModel: + def test_export_azaion_pt_to_coreml(self, fixture_pt_model): + # Arrange + output_dir = _INPUT_DATA / "azaion.mlpackage" + if output_dir.exists(): + shutil.rmtree(output_dir) + + # Act + model = YOLO(fixture_pt_model) + model.export(format="coreml", imgsz=1280) + exported = Path(fixture_pt_model).with_suffix(".mlpackage") + if exported != output_dir: + shutil.move(str(exported), str(output_dir)) + + # Assert + assert output_dir.exists() + model_file = output_dir / "Data" / "com.apple.CoreML" / "model.mlmodel" + assert model_file.exists()