diff --git a/.cursor/skills/autopilot/flows/existing-code.md b/.cursor/skills/autopilot/flows/existing-code.md index 0e47f87..cbc6a96 100644 --- a/.cursor/skills/autopilot/flows/existing-code.md +++ b/.cursor/skills/autopilot/flows/existing-code.md @@ -217,22 +217,18 @@ After deployment completes, the existing-code workflow is done. **Re-Entry After Completion** Condition: the autopilot state shows `step: done` OR all steps through 13 (Deploy) are completed -Action: The project completed a full cycle. Present status and loop back to New Task: +Action: The project completed a full cycle. Print the status banner and automatically loop back to New Task — do NOT ask the user for confirmation: ``` ══════════════════════════════════════ PROJECT CYCLE COMPLETE ══════════════════════════════════════ The previous cycle finished successfully. - You can now add new functionality. -══════════════════════════════════════ - A) Add new features (start New Task) - B) Done — no more changes needed + Starting new feature cycle… ══════════════════════════════════════ ``` -- If user picks A → set `step: 8`, `status: not_started` in the state file, then auto-chain to Step 8 (New Task). -- If user picks B → report final project status and exit. +Set `step: 8`, `status: not_started` in the state file, then auto-chain to Step 8 (New Task). 
## Auto-Chain Rules diff --git a/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/model.mlmodel b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/model.mlmodel new file mode 100644 index 0000000..0e70ecf Binary files /dev/null and b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/model.mlmodel differ diff --git a/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/weights/weight.bin b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/weights/weight.bin new file mode 100644 index 0000000..63e3f56 Binary files /dev/null and b/_docs/00_problem/input_data/azaion.mlpackage/Data/com.apple.CoreML/weights/weight.bin differ diff --git a/_docs/00_problem/input_data/azaion.mlpackage/Manifest.json b/_docs/00_problem/input_data/azaion.mlpackage/Manifest.json new file mode 100644 index 0000000..ef7babc --- /dev/null +++ b/_docs/00_problem/input_data/azaion.mlpackage/Manifest.json @@ -0,0 +1,18 @@ +{ + "fileFormatVersion": "1.0.0", + "itemInfoEntries": { + "7DD8829B-A724-4A3E-A14C-492D47CA6638": { + "author": "com.apple.CoreML", + "description": "CoreML Model Specification", + "name": "model.mlmodel", + "path": "com.apple.CoreML/model.mlmodel" + }, + "D5921D3B-1680-4CD9-B1D5-130EE972BBFD": { + "author": "com.apple.CoreML", + "description": "CoreML Model Weights", + "name": "weights", + "path": "com.apple.CoreML/weights" + } + }, + "rootModelIdentifier": "7DD8829B-A724-4A3E-A14C-492D47CA6638" +} diff --git a/_docs/04_deploy/ci_cd_pipeline.md b/_docs/04_deploy/ci_cd_pipeline.md new file mode 100644 index 0000000..53df502 --- /dev/null +++ b/_docs/04_deploy/ci_cd_pipeline.md @@ -0,0 +1,184 @@ +# Azaion AI Training — CI/CD Pipeline + +## Pipeline Overview + +| Stage | Trigger | Quality Gate | +|-------|---------|-------------| +| Lint | Every push | Zero lint errors | +| Test | Every push | All tests pass | +| Security | Every push | Zero critical/high CVEs | +| Build | PR merge to dev | Docker 
build succeeds | +| Push | After build | Images pushed to registry | +| Deploy | Manual trigger | Health checks pass on target server | + +No staging environment — the system runs on a dedicated GPU server. "Staging" is replaced by the test suite running in CI on CPU-only runners (annotation queue tests, unit tests) and manual GPU verification on the target machine. + +## Stage Details + +### Lint + +- `black --check src/` — Python formatting +- `ruff check src/` — Python linting +- Runs on standard CI runner (no GPU) + +### Test + +- Framework: `pytest` +- Command: `pytest tests/ -v --tb=short` +- Test compose for annotation queue integration tests: `docker compose -f docker-compose.test.yml up --abort-on-container-exit` +- GPU-dependent tests (training, export) are excluded from CI — they require a physical GPU and run during manual verification on the target server +- Coverage report published as pipeline artifact + +### Security + +- Dependency audit: `pip-audit -r requirements.txt` +- Dependency audit: `pip-audit -r src/annotation-queue/requirements.txt` +- SAST scan: Semgrep with `p/python` ruleset +- Image scan: Trivy on built Docker images +- Block on: critical or high severity findings + +### Build + +- Docker images built for both components: + - `docker/training.Dockerfile` → `azaion/training:<git-sha>` + - `docker/annotation-queue.Dockerfile` → `azaion/annotation-queue:<git-sha>` +- Build cache: Docker layer cache via GitHub Actions cache +- Build runs on standard runner — no GPU needed for `docker build` + +### Push + +- Registry: configurable via `DOCKER_REGISTRY` secret (e.g., GitHub Container Registry `ghcr.io`, or private registry) +- Authentication: registry login via CI secrets (`DOCKER_REGISTRY_USER`, `DOCKER_REGISTRY_TOKEN`) + +### Deploy + +- **Manual trigger only** (workflow_dispatch) — training runs for days, unattended deploys are risky +- Deployment method: SSH to target GPU server, run deploy scripts (`scripts/deploy.sh`) +- Pre-deploy: pull new images, 
+ stop services gracefully +- Post-deploy: start services, run health check script +- Rollback: `scripts/deploy.sh --rollback` redeploys previous image tags + +## Pipeline Configuration (GitHub Actions) + +```yaml +name: CI/CD + +on: + push: + branches: [dev, main] + pull_request: + branches: [dev] + workflow_dispatch: + inputs: + deploy_target: + description: "Deploy to target server" + required: true + type: boolean + default: false + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - run: pip install black ruff + - run: black --check src/ + - run: ruff check src/ + + test: + runs-on: ubuntu-latest + needs: lint + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - run: pip install -r requirements-test.txt + - run: pytest tests/ -v --tb=short --ignore=tests/gpu + + security: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - run: pip install pip-audit + - run: pip-audit -r requirements.txt + - run: pip-audit -r src/annotation-queue/requirements.txt + - uses: returntocorp/semgrep-action@v1 + with: + config: p/python + + build-and-push: + runs-on: ubuntu-latest + needs: [test, security] + if: github.ref == 'refs/heads/dev' && github.event_name == 'push' + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + registry: ${{ secrets.DOCKER_REGISTRY }} + username: ${{ secrets.DOCKER_REGISTRY_USER }} + password: ${{ secrets.DOCKER_REGISTRY_TOKEN }} + - uses: docker/build-push-action@v6 + with: + context: . + file: docker/training.Dockerfile + push: true + tags: ${{ secrets.DOCKER_REGISTRY }}/azaion/training:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + - uses: docker/build-push-action@v6 + with: + context: . 
+ file: docker/annotation-queue.Dockerfile + push: true + tags: ${{ secrets.DOCKER_REGISTRY }}/azaion/annotation-queue:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + + deploy: + runs-on: ubuntu-latest + needs: build-and-push + if: ${{ !cancelled() && github.event.inputs.deploy_target == 'true' }} + environment: production + steps: + - uses: actions/checkout@v4 + - run: | + ssh ${{ secrets.DEPLOY_USER }}@${{ secrets.DEPLOY_HOST }} \ + "cd /opt/azaion-training && \ + DOCKER_IMAGE_TAG=${{ github.sha }} \ + bash scripts/deploy.sh" +``` + +## Caching Strategy + +| Cache | Key | Restore Keys | +|-------|-----|-------------| +| pip dependencies | `requirements.txt` hash | `pip-` prefix | +| Docker layers | GitHub Actions cache (BuildKit) | `gha-` prefix | + +## Parallelization + +``` +push event + ├── lint ──► test ──┐ + │ ├──► build-and-push ──► deploy (manual) + └── security ───────┘ +``` + +Lint and security run in parallel. Test depends on lint. Build depends on both test and security passing. 
+ +## Notifications + +| Event | Channel | Recipients | +|-------|---------|-----------| +| Build failure | GitHub PR check | PR author | +| Security alert | GitHub security tab | Repository maintainers | +| Deploy success | GitHub Actions log | Deployment team | +| Deploy failure | GitHub Actions log + email | Deployment team | diff --git a/_docs/04_deploy/containerization.md b/_docs/04_deploy/containerization.md new file mode 100644 index 0000000..5efd335 --- /dev/null +++ b/_docs/04_deploy/containerization.md @@ -0,0 +1,196 @@ +# Azaion AI Training — Containerization + +## Component Dockerfiles + +### Training Pipeline + +| Property | Value | +|----------|-------| +| Base image | `nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04` | +| Build image | Same (devel required for TensorRT engine build + pycuda) | +| Stages | 1) system deps + Python 3.10 → 2) pip install requirements → 3) copy source | +| User | `azaion` (non-root, UID 1000) | +| Health check | Not applicable — batch job, exits on completion | +| Exposed ports | None | +| Key build args | `CUDA_VERSION=12.1.1` | + +Single-stage build (devel image required at runtime for TensorRT engine compilation and pycuda). The image is large but training runs for days on a dedicated GPU server — image size is not a deployment bottleneck. + +Installs from `requirements.txt` with `--extra-index-url https://download.pytorch.org/whl/cu121` for PyTorch CUDA 12.1 wheels. + +Volume mount: `/azaion/` host directory for datasets, models, and annotation data. 
+ +### Annotation Queue + +| Property | Value | +|----------|-------| +| Base image | `python:3.10-slim` | +| Build image | `python:3.10-slim` (no compilation needed) | +| Stages | 1) pip install from `src/annotation-queue/requirements.txt` → 2) copy source | +| User | `azaion` (non-root, UID 1000) | +| Health check | `CMD python -c "import rstream" \|\| exit 1` (process liveness; no HTTP endpoint) | +| Exposed ports | None | +| Key build args | None | + +Lightweight container — only needs `pyyaml`, `msgpack`, `rstream`. No GPU, no heavy ML libraries. Runs as a persistent async process consuming from RabbitMQ Streams. + +Volume mount: `/azaion/` host directory for writing annotation images and labels. + +### Not Containerized + +The following are developer/verification tools, not production services: + +- **Inference Engine** (`start_inference.py`) — used for testing and model verification, runs ad-hoc on a GPU machine +- **Data Tools** (`convert-annotations.py`, `dataset-visualiser.py`) — interactive developer utilities requiring GUI environment + +## Docker Compose — Local Development + +```yaml +services: + rabbitmq: + image: rabbitmq:3.13-management-alpine + ports: + - "5552:5552" + - "5672:5672" + - "15672:15672" + environment: + RABBITMQ_DEFAULT_USER: ${RABBITMQ_USER} + RABBITMQ_DEFAULT_PASS: ${RABBITMQ_PASSWORD} + volumes: + - rabbitmq_data:/var/lib/rabbitmq + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "check_running"] + interval: 10s + timeout: 5s + retries: 5 + + annotation-queue: + build: + context: . + dockerfile: docker/annotation-queue.Dockerfile + env_file: .env + depends_on: + rabbitmq: + condition: service_healthy + volumes: + - ${AZAION_ROOT_DIR:-/azaion}:/azaion + restart: unless-stopped + + training: + build: + context: . 
+ dockerfile: docker/training.Dockerfile + env_file: .env + volumes: + - ${AZAION_ROOT_DIR:-/azaion}:/azaion + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + ipc: host + shm_size: "16g" + +volumes: + rabbitmq_data: + +networks: + default: + name: azaion-training +``` + +Notes: +- `ipc: host` and `shm_size: "16g"` for PyTorch multi-worker data loading +- `annotation-queue` runs continuously, restarts on failure +- RabbitMQ Streams plugin must be enabled (port 5552); the management UI is on port 15672 + +## Docker Compose — Blackbox Tests + +```yaml +services: + rabbitmq: + image: rabbitmq:3.13-management-alpine + ports: + - "5552:5552" + - "5672:5672" + environment: + RABBITMQ_DEFAULT_USER: test_user + RABBITMQ_DEFAULT_PASS: test_pass + healthcheck: + test: ["CMD", "rabbitmq-diagnostics", "check_running"] + interval: 5s + timeout: 3s + retries: 10 + + annotation-queue: + build: + context: . + dockerfile: docker/annotation-queue.Dockerfile + environment: + RABBITMQ_HOST: rabbitmq + RABBITMQ_PORT: "5552" + RABBITMQ_USER: test_user + RABBITMQ_PASSWORD: test_pass + RABBITMQ_QUEUE_NAME: azaion-annotations + AZAION_ROOT_DIR: /azaion + depends_on: + rabbitmq: + condition: service_healthy + volumes: + - test_data:/azaion + + test-runner: + build: + context: . + dockerfile: docker/test-runner.Dockerfile + environment: + RABBITMQ_HOST: rabbitmq + RABBITMQ_PORT: "5552" + RABBITMQ_USER: test_user + RABBITMQ_PASSWORD: test_pass + AZAION_ROOT_DIR: /azaion + TEST_SCOPE: blackbox + depends_on: + rabbitmq: + condition: service_healthy + annotation-queue: + condition: service_started + volumes: + - test_data:/azaion + +volumes: + test_data: +``` + +Run: `docker compose -f docker-compose.test.yml up --abort-on-container-exit` + +Note: GPU-dependent tests (training) require `--gpus all` and are excluded from the default blackbox test suite. 
They run separately via `docker compose -f docker-compose.test.yml --profile gpu-tests up --abort-on-container-exit`. + +## Image Tagging Strategy + +| Context | Tag Format | Example | +|---------|-----------|---------| +| CI build | `<registry>/azaion/<component>:<git-sha>` | `registry.example.com/azaion/training:a1b2c3d` | +| Release | `<registry>/azaion/<component>:<semver>` | `registry.example.com/azaion/training:1.0.0` | +| Local dev | `azaion-<component>:latest` | `azaion-training:latest` | + +## .dockerignore + +``` +.git +.cursor +_docs +_standalone +tests +**/__pycache__ +**/*.pyc +*.md +.env +.env.example +docker-compose*.yml +.gitignore +.editorconfig +requirements-test.txt +``` diff --git a/_docs/04_deploy/deploy_scripts.md b/_docs/04_deploy/deploy_scripts.md new file mode 100644 index 0000000..b15b002 --- /dev/null +++ b/_docs/04_deploy/deploy_scripts.md @@ -0,0 +1,115 @@ +# Azaion AI Training — Deployment Scripts + +## Overview + +| Script | Purpose | Location | +|--------|---------|----------| +| `deploy.sh` | Main deployment orchestrator | `scripts/deploy.sh` | +| `generate-config.sh` | Generate `config.yaml` from environment variables | `scripts/generate-config.sh` | +| `pull-images.sh` | Pull Docker images from registry | `scripts/pull-images.sh` | +| `start-services.sh` | Start all services via Docker Compose | `scripts/start-services.sh` | +| `stop-services.sh` | Graceful shutdown with tag backup | `scripts/stop-services.sh` | +| `health-check.sh` | Verify deployment health | `scripts/health-check.sh` | + +## Prerequisites + +- Docker and Docker Compose installed on target machine +- NVIDIA driver + Docker GPU support (`nvidia-container-toolkit`) +- SSH access to target machine (for remote deployment) +- `.env` file with required environment variables (see `.env.example`) + +## Environment Variables + +All scripts source `.env` from the project root. 
+ +| Variable | Required By | Purpose | +|----------|------------|---------| +| `DEPLOY_HOST` | `deploy.sh` (remote) | SSH target for remote deployment | +| `DEPLOY_USER` | `deploy.sh` (remote) | SSH user (default: `deploy`) | +| `DOCKER_REGISTRY` | `pull-images.sh` | Container registry URL | +| `DOCKER_IMAGE_TAG` | `pull-images.sh` | Image version to deploy (default: `latest`) | +| `AZAION_API_URL` | `generate-config.sh` | Azaion REST API URL | +| `AZAION_API_EMAIL` | `generate-config.sh` | API login email | +| `AZAION_API_PASSWORD` | `generate-config.sh` | API login password | +| `RABBITMQ_HOST` | `generate-config.sh` | RabbitMQ host | +| `RABBITMQ_PORT` | `generate-config.sh` | RabbitMQ port | +| `RABBITMQ_USER` | `generate-config.sh` | RabbitMQ username | +| `RABBITMQ_PASSWORD` | `generate-config.sh` | RabbitMQ password | +| `RABBITMQ_QUEUE_NAME` | `generate-config.sh` | RabbitMQ queue name | +| `AZAION_ROOT_DIR` | `start-services.sh`, `health-check.sh` | Root data directory (default: `/azaion`) | + +## Script Details + +### deploy.sh + +Main orchestrator: generates config, pulls images, stops old services, starts new ones, checks health. + +``` +./scripts/deploy.sh # Deploy latest version (local) +./scripts/deploy.sh --rollback # Rollback to previous version +./scripts/deploy.sh --local # Force local mode (skip SSH) +./scripts/deploy.sh --help # Show usage +``` + +Flow: `generate-config.sh` → `pull-images.sh` → `stop-services.sh` → `start-services.sh` → `health-check.sh` + +When `DEPLOY_HOST` is set, commands execute over SSH on the remote server. Without it, runs locally. + +### generate-config.sh + +Generates `config.yaml` from environment variables, preserving the existing config format the codebase expects. Validates that all required variables are set before writing. + +``` +./scripts/generate-config.sh # Generate config.yaml +``` + +### pull-images.sh + +Pulls Docker images for both deployable components from the configured registry. 
+ +``` +./scripts/pull-images.sh # Pull images +``` + +Images pulled: +- `${DOCKER_REGISTRY}/azaion/training:${DOCKER_IMAGE_TAG}` +- `${DOCKER_REGISTRY}/azaion/annotation-queue:${DOCKER_IMAGE_TAG}` + +### start-services.sh + +Creates the `/azaion/` directory tree if needed, then runs `docker compose up -d`. + +``` +./scripts/start-services.sh # Start services +``` + +### stop-services.sh + +Saves current image tags to `scripts/.previous-tags` for rollback, then stops and removes containers with a 30-second grace period. + +``` +./scripts/stop-services.sh # Stop services +``` + +### health-check.sh + +Checks container status, GPU availability, disk usage, and queue offset. Returns exit code 0 (healthy) or 1 (unhealthy). + +``` +./scripts/health-check.sh # Run health check +``` + +Checks performed: +- Annotation queue and RabbitMQ containers running +- GPU available and temperature < 90°C +- Disk usage < 95% (warning at 80%) +- Queue offset file exists + +## Common Properties + +All scripts: +- Use `#!/bin/bash` with `set -euo pipefail` +- Support `--help` flag +- Source `.env` from project root if present +- Are idempotent +- Support remote execution via SSH (`DEPLOY_HOST` + `DEPLOY_USER`) diff --git a/_docs/04_deploy/deployment_procedures.md b/_docs/04_deploy/deployment_procedures.md new file mode 100644 index 0000000..e350772 --- /dev/null +++ b/_docs/04_deploy/deployment_procedures.md @@ -0,0 +1,115 @@ +# Azaion AI Training — Deployment Procedures + +## Deployment Strategy + +**Pattern**: Stop-and-replace on a single GPU server +**Rationale**: The system runs on one dedicated GPU server. Training takes days — there is no "zero-downtime" concern for the training process. The annotation queue can tolerate brief restarts (queue offset is persisted, messages are replayed from last offset). 
+ +### Component Behavior During Deploy + +| Component | Deploy Impact | Recovery | +|-----------|--------------|----------| +| Training Pipeline | Must finish current run or be stopped manually. Never interrupted mid-training — checkpoints save every epoch. | Resume from last checkpoint (`resume_training`) | +| Annotation Queue | Brief restart (< 30 seconds). Messages accumulate in RabbitMQ during downtime. | Resumes from persisted offset in `offset.yaml` | + +### Graceful Shutdown + +- **Training**: not stopped by deploy scripts — training runs for days and is managed independently. Deploy only updates images/code for the *next* training run. +- **Annotation Queue**: `docker stop` with 30-second grace period → SIGTERM → process exits → container replaced with new image. + +## Health Checks + +No HTTP endpoints — these are batch processes and queue consumers. Health is verified by: + +| Check | Method | Target | Interval | Failure Action | +|-------|--------|--------|----------|----------------| +| Annotation Queue alive | `docker inspect --format='{{.State.Running}}'` | annotation-queue container | 5 min (cron) | Restart container | +| RabbitMQ reachable | TCP connect to `$RABBITMQ_HOST:$RABBITMQ_PORT` | RabbitMQ server | 5 min (cron) | Alert, check network | +| GPU available | `nvidia-smi` exit code | NVIDIA driver | 5 min (cron) | Alert, check driver | +| Disk space | `df /azaion/ --output=pcent` | Filesystem | 5 min (cron) | Alert if > 80%, critical if > 95% | +| Queue offset advancing | Compare `offset.yaml` value to previous check | Annotation queue progress | 30 min | Alert if stale and queue has messages | + +All checks are performed by `scripts/health-check.sh`. 
+ +## Production Deployment + +### Pre-Deploy Checklist + +- [ ] All CI tests pass on `dev` branch +- [ ] Security scan clean (zero critical/high CVEs) +- [ ] Docker images built and pushed to registry +- [ ] `.env` on target server is up to date with any new variables +- [ ] `/azaion/` directory tree exists with correct permissions +- [ ] No training run is currently active (or training will not be restarted this deploy) +- [ ] NVIDIA driver and Docker with GPU support are installed on target + +### Deploy Steps + +1. SSH to GPU server +2. Pull new Docker images: `scripts/pull-images.sh` +3. Stop annotation queue: `scripts/stop-services.sh` +4. Generate `config.yaml` from `.env` template +5. Start services: `scripts/start-services.sh` +6. Verify health: `scripts/health-check.sh` +7. Confirm annotation queue is consuming messages (check offset advancing) + +All steps are orchestrated by `scripts/deploy.sh`. + +### Post-Deploy Verification + +- Check `docker ps` — annotation-queue container is running +- Check `docker logs annotation-queue --tail 20` — no errors +- Check `offset.yaml` — offset is advancing (queue is consuming) +- Check disk space — adequate for continued operation + +## Rollback Procedures + +### Trigger Criteria + +- Annotation queue crashes repeatedly after deploy (restart loop) +- Queue messages are being dropped or corrupted +- `config.yaml` generation failed (missing env vars) +- New code has a bug affecting annotation processing + +### Rollback Steps + +1. Run `scripts/deploy.sh --rollback` + - This reads the previous image tags from `scripts/.previous-tags` (saved during deploy) + - Stops current containers + - Starts containers with previous image tags +2. Verify health: `scripts/health-check.sh` +3. Check annotation queue is consuming correctly +4. Investigate root cause of the failed deploy + +### Training Rollback + +Training is not managed by deploy scripts. If a new training run produces bad results: +1. 
The previous `best.pt` model is still available in `/azaion/models/` (dated directories) +2. Roll back by pointing `config.yaml` to the previous model +3. No container restart needed — training is a batch job started manually + +## Deployment Checklist (Quick Reference) + +``` +Pre-deploy: + □ CI green on dev branch + □ Images built and pushed + □ .env updated on server (if new vars added) + □ No active training run (if training container is being updated) + +Deploy: + □ SSH to server + □ Run scripts/deploy.sh + □ Verify health-check.sh passes + +Post-deploy: + □ docker ps shows containers running + □ docker logs show no errors + □ Queue offset advancing + □ Disk space adequate + +If problems: + □ Run scripts/deploy.sh --rollback + □ Verify health + □ Investigate logs +``` diff --git a/_docs/04_deploy/environment_strategy.md b/_docs/04_deploy/environment_strategy.md new file mode 100644 index 0000000..3ead8e7 --- /dev/null +++ b/_docs/04_deploy/environment_strategy.md @@ -0,0 +1,106 @@ +# Azaion AI Training — Environment Strategy + +## Environments + +| Environment | Purpose | Infrastructure | Data Source | +|-------------|---------|---------------|-------------| +| Development | Local developer workflow | docker-compose, local RabbitMQ | Test annotations, small sample dataset | +| Production | Live training on GPU server | Direct host processes or Docker, real RabbitMQ | Real annotations from Azaion platform | + +No staging environment — the system is an ML training pipeline on a dedicated GPU server, not a multi-tier web service. Validation happens through the CI test suite (CPU tests) and manual verification on the GPU server before committing to a long training run. 
+ +## Environment Variables + +### Required Variables + +| Variable | Purpose | Dev Default | Prod Source | +|----------|---------|-------------|-------------| +| `AZAION_API_URL` | Azaion REST API base URL | `https://api.azaion.com` | `.env` on server | +| `AZAION_API_EMAIL` | API login email | dev account | `.env` on server | +| `AZAION_API_PASSWORD` | API login password | dev password | `.env` on server | +| `RABBITMQ_HOST` | RabbitMQ host | `127.0.0.1` (local container) | `.env` on server | +| `RABBITMQ_PORT` | RabbitMQ Streams port | `5552` | `.env` on server | +| `RABBITMQ_USER` | RabbitMQ username | `azaion_receiver` | `.env` on server | +| `RABBITMQ_PASSWORD` | RabbitMQ password | `changeme` | `.env` on server | +| `RABBITMQ_QUEUE_NAME` | Queue name | `azaion-annotations` | `.env` on server | +| `AZAION_ROOT_DIR` | Root data directory | `/azaion` | `.env` on server | +| `AZAION_DATA_DIR` | Validated annotations dir name | `data` | `.env` on server | +| `AZAION_DATA_SEED_DIR` | Unvalidated annotations dir name | `data-seed` | `.env` on server | +| `AZAION_DATA_DELETED_DIR` | Deleted annotations dir name | `data_deleted` | `.env` on server | +| `TRAINING_MODEL` | Base model filename | `yolo26m.pt` | `.env` on server | +| `TRAINING_EPOCHS` | Training epochs | `120` | `.env` on server | +| `TRAINING_BATCH_SIZE` | Training batch size | `11` | `.env` on server | +| `TRAINING_IMGSZ` | Training image size | `1280` | `.env` on server | +| `TRAINING_SAVE_PERIOD` | Checkpoint save interval | `1` | `.env` on server | +| `TRAINING_WORKERS` | Dataloader workers | `24` | `.env` on server | +| `EXPORT_ONNX_IMGSZ` | ONNX export image size | `1280` | `.env` on server | + +### `.env.example` + +Committed to version control with placeholder values. See `.env.example` in project root (created in Step 1). 
+ +### Variable Validation + +The `config.yaml` generation script (part of deploy scripts) validates that all required environment variables are set before writing the config file. Missing variables cause an immediate failure with a clear error listing which variables are absent. + +## Config Generation + +The codebase reads configuration from `config.yaml`, not directly from environment variables. The deployment flow generates `config.yaml` from environment variables at deploy time: + +1. `.env` contains all variable values (never committed) +2. Deploy script sources `.env` and renders `config.yaml` from a template +3. `config.yaml` is placed at the expected location for the application + +This preserves the existing code's config reading pattern while externalizing secrets to environment variables. + +## Secrets Management + +| Environment | Method | Location | +|-------------|--------|----------| +| Development | `.env` file (git-ignored) | Project root | +| Production | `.env` file (restricted permissions) | GPU server `/opt/azaion-training/.env` | + +Production `.env` file: +- Ownership: `root:deploy` (deploy user's group) +- Permissions: `640` (owner read/write, group read, others none) +- Located outside the Docker build context + +Secrets in this project: +- `AZAION_API_PASSWORD` — API authentication +- `RABBITMQ_PASSWORD` — message queue access +- CDN credentials — auto-provisioned via API at runtime (encrypted `cdn.yaml`), not in `.env` +- Model encryption key — hardcoded in `security.py` (existing pattern, flagged as security concern) + +Rotation policy: rotate API and RabbitMQ passwords quarterly. Update `.env` on the server, restart affected services. 
+ +## Filesystem Management + +| Environment | `/azaion/` Location | Contents | +|-------------|-------------------|----------| +| Development | Docker volume or local dir | Test images, small sample labels | +| Production | Host directory `/azaion/` | Full annotation dataset, trained models, export artifacts | + +The `/azaion/` directory tree must exist before services start: + +``` +/azaion/ +├── data/ (validated annotations: images/ + labels/) +├── data-seed/ (unvalidated annotations: images/ + labels/) +├── data_deleted/ (soft-deleted annotations: images/ + labels/) +├── datasets/ (formed training datasets: azaion-YYYY-MM-DD/) +├── models/ (trained models: azaion-YYYY-MM-DD/, azaion.pt) +└── classes.json (annotation class definitions) +``` + +Production data is persistent and never deleted by deployment. Docker containers mount this directory as a bind mount. + +## External Service Configuration + +| Service | Dev | Prod | +|---------|-----|------| +| Azaion REST API | Real API (dev credentials) | Real API (prod credentials) | +| S3-compatible CDN | Auto-provisioned via API | Auto-provisioned via API | +| RabbitMQ | Local container (docker-compose) | Managed instance on network | +| NVIDIA GPU | Host GPU via `--gpus all` | Host GPU via `--gpus all` | + +CDN credentials are not in `.env` — they are fetched from the API at runtime as an encrypted `cdn.yaml` file, decrypted using the hardware-bound key. This is the existing pattern and does not need environment variable configuration. diff --git a/_docs/04_deploy/observability.md b/_docs/04_deploy/observability.md new file mode 100644 index 0000000..b069367 --- /dev/null +++ b/_docs/04_deploy/observability.md @@ -0,0 +1,135 @@ +# Azaion AI Training — Observability + +This system is an ML training pipeline, not a web service. Observability focuses on training progress, GPU health, queue throughput, and disk usage rather than HTTP request metrics. + +## Logging + +### Format + +Structured JSON to stdout/stderr. 
Containers should not write log files — use Docker's log driver for collection. + +```json +{ + "timestamp": "2026-03-28T14:30:00Z", + "level": "INFO", + "service": "training", + "message": "Epoch 45/120 completed", + "context": {"epoch": 45, "loss": 0.0234, "mAP50": 0.891} +} +``` + +### Log Levels + +| Level | Usage | Example | +|-------|-------|---------| +| ERROR | Exceptions, unrecoverable failures | GPU out of memory, API auth failed, corrupt label file | +| WARN | Recoverable issues | Queue reconnection attempt, skipped corrupt image | +| INFO | Progress and business events | Epoch completed, dataset formed, model exported, annotation saved | +| DEBUG | Diagnostics (dev only) | Individual file processing, queue message contents | + +### Current State + +| Component | Current Logging | Target | +|-----------|----------------|--------| +| Training Pipeline | `print()` statements | Python `logging` with JSON formatter to stdout | +| Annotation Queue | `logging` with TimedRotatingFileHandler | Keep existing + add JSON stdout for Docker | +| Inference Engine | `print()` statements | Not in deployment scope | + +### Retention + +| Environment | Destination | Retention | +|-------------|-------------|-----------| +| Development | Console (docker logs) | Session | +| Production | Docker JSON log driver → host filesystem | 30 days (log rotation via Docker daemon config) | + +### PII Rules + +- Never log API passwords or tokens +- Never log CDN credentials +- Never log model encryption keys +- Queue message image data (base64 bytes) must not be logged at INFO level + +## Metrics + +### Collection Method + +No HTTP `/metrics` endpoint — these are batch processes, not services. Metrics are collected via: +1. **Docker stats** — CPU, memory, GPU via `nvidia-smi` +2. **Training logs** — parsed from structured log output (epoch, loss, mAP) +3. 
**Filesystem monitoring** — disk usage of `/azaion/` directory tree + +### Key Metrics + +| Metric | Type | Source | Description | +|--------|------|--------|-------------| +| `training_epoch` | Gauge | Training logs | Current epoch number | +| `training_loss` | Gauge | Training logs | Current training loss | +| `training_mAP50` | Gauge | Training logs | Mean average precision at IoU 0.50 | +| `training_mAP50_95` | Gauge | Training logs | mAP at IoU 0.50:0.95 | +| `gpu_utilization_pct` | Gauge | `nvidia-smi` | GPU compute utilization | +| `gpu_memory_used_mb` | Gauge | `nvidia-smi` | GPU memory usage | +| `gpu_temperature_c` | Gauge | `nvidia-smi` | GPU temperature | +| `disk_usage_azaion_gb` | Gauge | `df` / `du` | Total disk usage of `/azaion/` | +| `disk_usage_datasets_gb` | Gauge | `du` | Disk usage of `/azaion/datasets/` | +| `disk_usage_models_gb` | Gauge | `du` | Disk usage of `/azaion/models/` | +| `queue_messages_processed` | Counter | Queue logs | Total annotations processed | +| `queue_messages_failed` | Counter | Queue logs | Failed message processing | +| `queue_offset` | Gauge | `offset.yaml` | Last processed queue offset | + +### Monitoring Script + +A `scripts/health-check.sh` script (created in Step 7) collects these metrics on demand: +- Checks Docker container status +- Reads `nvidia-smi` for GPU metrics +- Checks disk usage +- Reads annotation queue offset +- Reports overall system health + +Collection interval: on-demand via health check script, or via cron job (every 5 minutes) for continuous monitoring. + +## Distributed Tracing + +Not applicable. The system consists of independent batch processes (training, annotation queue) that do not form request chains. No distributed tracing is needed. 
+ +## Alerting + +| Severity | Condition | Response Time | Action | +|----------|-----------|---------------|--------| +| Critical | GPU temperature > 90°C | Immediate | Pause training, investigate cooling | +| Critical | Annotation queue process crashed | 5 min | Restart container, check logs | +| Critical | Disk usage > 95% | 5 min | Free space (old datasets/models), expand storage | +| High | Training loss NaN or diverging | 30 min | Check dataset, review hyperparameters | +| High | GPU memory OOM | 30 min | Reduce batch size, restart training | +| Medium | Disk usage > 80% | 4 hours | Plan cleanup of old datasets | +| Medium | Queue offset stale for > 1 hour | 4 hours | Check RabbitMQ connectivity | +| Low | Training checkpoint save failed | Next business day | Check disk space, retry | + +### Notification Method + +For a single GPU server deployment, alerts are practical via: +- **Cron-based health check** running `scripts/health-check.sh` every 5 minutes +- Critical/High alerts: write to a status file, optionally send email or webhook notification +- Dashboard: a simple status page generated from the last health check output + +## Dashboards + +### Operations View + +For a single-server deployment, a lightweight monitoring approach: + +1. **GPU dashboard**: `nvidia-smi dmon` or `nvitop` running in a tmux session +2. **Training progress**: tail structured logs for epoch/loss/mAP progression +3. **Disk usage**: periodic `du -sh /azaion/*/` output +4. **Container status**: `docker ps` + `docker stats` + +### Training Progress View + +Key information to track during a training run: +- Current epoch / total epochs +- Training loss trend (decreasing = good) +- Validation mAP50 and mAP50-95 (increasing = good) +- GPU utilization and temperature +- Estimated time remaining +- Last checkpoint saved + +YOLO's built-in TensorBoard integration provides this out of the box. Access via `tensorboard --logdir /azaion/models/azaion-YYYY-MM-DD/` on the training server. 
diff --git a/_docs/04_deploy/reports/deploy_status_report.md b/_docs/04_deploy/reports/deploy_status_report.md index c46a460..5298465 100644 --- a/_docs/04_deploy/reports/deploy_status_report.md +++ b/_docs/04_deploy/reports/deploy_status_report.md @@ -8,8 +8,10 @@ |-----------|--------|-------------|---------|-------------------| | Training Pipeline | Implemented & Tested | `train.py` | Long-running (days) | GPU server, RTX 4090 (24GB VRAM) | | Annotation Queue | Implemented & Tested | `annotation-queue/annotation_queue_handler.py` | Continuous (async) | Any server with network access | -| Inference Engine | Implemented & Tested | `start_inference.py` | On-demand | GPU-equipped machine | -| Data Tools | Implemented | `convert-annotations.py`, `dataset-visualiser.py` | Ad-hoc | Developer machine | + +Not deployed as production services: +- **Inference Engine** (`start_inference.py`) — verification/testing tool, runs ad-hoc on GPU machine +- **Data Tools** (`convert-annotations.py`, `dataset-visualiser.py`) — developer utilities Note: Augmentation is not a separate process — it is YOLO's built-in mosaic/mixup within the training pipeline. diff --git a/_docs/_autopilot_state.md b/_docs/_autopilot_state.md index ec07404..90ad604 100644 --- a/_docs/_autopilot_state.md +++ b/_docs/_autopilot_state.md @@ -2,8 +2,8 @@ ## Current Step flow: existing-code -step: 13 +step: done name: Deploy -status: in_progress -sub_step: 1 — Status & Env Check +status: completed +sub_step: 7 — Deployment Scripts retry_count: 0 diff --git a/scripts/deploy.sh b/scripts/deploy.sh new file mode 100755 index 0000000..73d4523 --- /dev/null +++ b/scripts/deploy.sh @@ -0,0 +1,105 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +usage() { + cat < "$CONFIG_FILE" </dev/null || true) + if [[ -z "$cid" ]]; then + check "$svc" "container not found" + else + state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null || echo "unknown") + if [[ "$state" == "running" ]]; then + check "$svc" "OK" + else + check "$svc" "state=$state" + fi + fi +done +echo "" + +echo "GPU:" +if command -v nvidia-smi &>/dev/null; then + gpu_temp=$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") + gpu_mem=$(nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") + gpu_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") + + if [[ "$gpu_temp" != "N/A" ]]; then + check "GPU available" "OK" + printf " %-30s %s°C\n" " Temperature" "$gpu_temp" + printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem" + printf " %-30s %s%%\n" " Utilization" "$gpu_util" + if [[ "$gpu_temp" -gt 90 ]]; then + check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C" + fi + else + check "GPU" "nvidia-smi failed" + fi +else + check "GPU (nvidia-smi)" "not installed" +fi +echo "" + +echo "Disk:" +if [[ -d "$AZAION_ROOT_DIR" ]]; then + disk_pct=$(df "$AZAION_ROOT_DIR" --output=pcent 2>/dev/null | tail -1 | tr -d ' %' || echo "N/A") + if [[ "$disk_pct" != "N/A" ]]; then + if [[ "$disk_pct" -gt 95 ]]; then + check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%" + elif [[ "$disk_pct" -gt 80 ]]; then + check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%" + else + check "Disk usage ($AZAION_ROOT_DIR)" "OK" + fi + printf " %-30s %s%%\n" " Usage" "$disk_pct" + fi + azaion_size=$(du -sh "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1 || echo "N/A") + printf " %-30s %s\n" " Total size" "$azaion_size" +else + check "Data directory ($AZAION_ROOT_DIR)" "does not exist" +fi +echo "" + +echo "Queue:" 
+OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml" +if [[ -f "$OFFSET_FILE" ]]; then + offset=$(grep 'offset_queue' "$OFFSET_FILE" 2>/dev/null | awk '{print $2}' || echo "N/A") + printf " %-30s %s\n" "Last queue offset" "$offset" + check "Offset file" "OK" +else + check "Offset file" "not found at $OFFSET_FILE" +fi +echo "" + +echo "=== Result: $(if $HEALTHY; then echo 'HEALTHY'; else echo 'UNHEALTHY'; fi) ===" + +if $HEALTHY; then + exit 0 +else + exit 1 +fi diff --git a/scripts/pull-images.sh b/scripts/pull-images.sh new file mode 100755 index 0000000..aeced88 --- /dev/null +++ b/scripts/pull-images.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +usage() { + cat </dev/null || true) + if [[ -n "$cid" ]]; then + img=$(docker inspect --format='{{.Config.Image}}' "$cid" 2>/dev/null || echo "unknown") + echo "PREV_IMAGE_${svc//-/_}=$img" + fi + done +} > "$PREV_TAGS" + +echo "Stopping services (30s grace period)..." +docker compose -f "$PROJECT_ROOT/docker-compose.yml" stop -t 30 + +echo "Removing containers..." +docker compose -f "$PROJECT_ROOT/docker-compose.yml" down --remove-orphans + +echo "Services stopped. 
Previous tags saved to $PREV_TAGS" diff --git a/tests/conftest.py b/tests/conftest.py index d4a2dff..d717976 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ _TEST_ROOT = _TESTS_DIR / "root" _DATASET_IMAGES = _TEST_ROOT / "data" / "images" _DATASET_LABELS = _TEST_ROOT / "data" / "labels" _ONNX_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion.onnx" +_PT_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion-2025-03-10.pt" _CLASSES_JSON = _PROJECT_ROOT / "src" / "classes.json" _CONFIG_TEST = _PROJECT_ROOT / "config.test.yaml" _MODELS_DIR = _TEST_ROOT / "models" @@ -88,6 +89,14 @@ def fixture_onnx_model(): return p.read_bytes() +@pytest.fixture(scope="session") +def fixture_pt_model(): + p = _PT_MODEL + if not p.is_file(): + pytest.skip(f"missing pt model: {p}") + return str(p) + + @pytest.fixture(scope="session") def fixture_classes_json(): p = _CLASSES_JSON diff --git a/tests/test_export.py b/tests/test_export.py index a36a271..631c4df 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -1,3 +1,4 @@ +import shutil import sys from pathlib import Path @@ -180,3 +181,27 @@ class TestCoremlExport: # Assert assert len(results) == 1 assert results[0].boxes is not None + + +_INPUT_DATA = _TESTS_DIR.parent / "_docs" / "00_problem" / "input_data" + + +@pytest.mark.skipif(sys.platform != "darwin", reason="CoreML requires macOS") +class TestCoremlExportRealModel: + def test_export_azaion_pt_to_coreml(self, fixture_pt_model): + # Arrange + output_dir = _INPUT_DATA / "azaion.mlpackage" + if output_dir.exists(): + shutil.rmtree(output_dir) + + # Act + model = YOLO(fixture_pt_model) + model.export(format="coreml", imgsz=1280) + exported = Path(fixture_pt_model).with_suffix(".mlpackage") + if exported != output_dir: + shutil.move(str(exported), str(output_dir)) + + # Assert + assert output_dir.exists() + model_file = output_dir / "Data" / "com.apple.CoreML" / "model.mlmodel" + assert model_file.exists()