mirror of
https://github.com/azaion/ai-training.git
synced 2026-04-22 05:26:36 +00:00
Update autopilot workflow and documentation for project cycle completion
- Modified the existing-code workflow to automatically loop back to New Task after project completion without user confirmation. - Updated the autopilot state to reflect the current step as `done` and status as `completed`. - Clarified the deployment status report by specifying non-deployed services and their purposes. These changes enhance the automation of task management and improve documentation clarity.
This commit is contained in:
@@ -217,22 +217,18 @@ After deployment completes, the existing-code workflow is done.
|
|||||||
**Re-Entry After Completion**
|
**Re-Entry After Completion**
|
||||||
Condition: the autopilot state shows `step: done` OR all steps through 13 (Deploy) are completed
|
Condition: the autopilot state shows `step: done` OR all steps through 13 (Deploy) are completed
|
||||||
|
|
||||||
Action: The project completed a full cycle. Present status and loop back to New Task:
|
Action: The project completed a full cycle. Print the status banner and automatically loop back to New Task — do NOT ask the user for confirmation:
|
||||||
|
|
||||||
```
|
```
|
||||||
══════════════════════════════════════
|
══════════════════════════════════════
|
||||||
PROJECT CYCLE COMPLETE
|
PROJECT CYCLE COMPLETE
|
||||||
══════════════════════════════════════
|
══════════════════════════════════════
|
||||||
The previous cycle finished successfully.
|
The previous cycle finished successfully.
|
||||||
You can now add new functionality.
|
Starting new feature cycle…
|
||||||
══════════════════════════════════════
|
|
||||||
A) Add new features (start New Task)
|
|
||||||
B) Done — no more changes needed
|
|
||||||
══════════════════════════════════════
|
══════════════════════════════════════
|
||||||
```
|
```
|
||||||
|
|
||||||
- If user picks A → set `step: 8`, `status: not_started` in the state file, then auto-chain to Step 8 (New Task).
|
Set `step: 8`, `status: not_started` in the state file, then auto-chain to Step 8 (New Task).
|
||||||
- If user picks B → report final project status and exit.
|
|
||||||
|
|
||||||
## Auto-Chain Rules
|
## Auto-Chain Rules
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"fileFormatVersion": "1.0.0",
|
||||||
|
"itemInfoEntries": {
|
||||||
|
"7DD8829B-A724-4A3E-A14C-492D47CA6638": {
|
||||||
|
"author": "com.apple.CoreML",
|
||||||
|
"description": "CoreML Model Specification",
|
||||||
|
"name": "model.mlmodel",
|
||||||
|
"path": "com.apple.CoreML/model.mlmodel"
|
||||||
|
},
|
||||||
|
"D5921D3B-1680-4CD9-B1D5-130EE972BBFD": {
|
||||||
|
"author": "com.apple.CoreML",
|
||||||
|
"description": "CoreML Model Weights",
|
||||||
|
"name": "weights",
|
||||||
|
"path": "com.apple.CoreML/weights"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"rootModelIdentifier": "7DD8829B-A724-4A3E-A14C-492D47CA6638"
|
||||||
|
}
|
||||||
@@ -0,0 +1,184 @@
|
|||||||
|
# Azaion AI Training — CI/CD Pipeline
|
||||||
|
|
||||||
|
## Pipeline Overview
|
||||||
|
|
||||||
|
| Stage | Trigger | Quality Gate |
|
||||||
|
|-------|---------|-------------|
|
||||||
|
| Lint | Every push | Zero lint errors |
|
||||||
|
| Test | Every push | All tests pass |
|
||||||
|
| Security | Every push | Zero critical/high CVEs |
|
||||||
|
| Build | PR merge to dev | Docker build succeeds |
|
||||||
|
| Push | After build | Images pushed to registry |
|
||||||
|
| Deploy | Manual trigger | Health checks pass on target server |
|
||||||
|
|
||||||
|
No staging environment — the system runs on a dedicated GPU server. "Staging" is replaced by the test suite running in CI on CPU-only runners (annotation queue tests, unit tests) and manual GPU verification on the target machine.
|
||||||
|
|
||||||
|
## Stage Details
|
||||||
|
|
||||||
|
### Lint
|
||||||
|
|
||||||
|
- `black --check src/` — Python formatting
|
||||||
|
- `ruff check src/` — Python linting
|
||||||
|
- Runs on standard CI runner (no GPU)
|
||||||
|
|
||||||
|
### Test
|
||||||
|
|
||||||
|
- Framework: `pytest`
|
||||||
|
- Command: `pytest tests/ -v --tb=short`
|
||||||
|
- Test compose for annotation queue integration tests: `docker compose -f docker-compose.test.yml up --abort-on-container-exit`
|
||||||
|
- GPU-dependent tests (training, export) are excluded from CI — they require a physical GPU and run during manual verification on the target server
|
||||||
|
- Coverage report published as pipeline artifact
|
||||||
|
|
||||||
|
### Security
|
||||||
|
|
||||||
|
- Dependency audit: `pip-audit -r requirements.txt`
|
||||||
|
- Dependency audit: `pip-audit -r src/annotation-queue/requirements.txt`
|
||||||
|
- SAST scan: Semgrep with `p/python` ruleset
|
||||||
|
- Image scan: Trivy on built Docker images
|
||||||
|
- Block on: critical or high severity findings
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
- Docker images built for both components:
|
||||||
|
- `docker/training.Dockerfile` → `azaion/training:<git-sha>`
|
||||||
|
- `docker/annotation-queue.Dockerfile` → `azaion/annotation-queue:<git-sha>`
|
||||||
|
- Build cache: Docker layer cache via GitHub Actions cache
|
||||||
|
- Build runs on standard runner — no GPU needed for `docker build`
|
||||||
|
|
||||||
|
### Push
|
||||||
|
|
||||||
|
- Registry: configurable via `DOCKER_REGISTRY` secret (e.g., GitHub Container Registry `ghcr.io`, or private registry)
|
||||||
|
- Authentication: registry login via CI secrets (`DOCKER_REGISTRY_USER`, `DOCKER_REGISTRY_TOKEN`)
|
||||||
|
|
||||||
|
### Deploy
|
||||||
|
|
||||||
|
- **Manual trigger only** (workflow_dispatch) — training runs for days, unattended deploys are risky
|
||||||
|
- Deployment method: SSH to target GPU server, run deploy scripts (`scripts/deploy.sh`)
|
||||||
|
- Pre-deploy: pull new images, stop services gracefully
|
||||||
|
- Post-deploy: start services, run health check script
|
||||||
|
- Rollback: `scripts/deploy.sh --rollback` redeploys previous image tags
|
||||||
|
|
||||||
|
## Pipeline Configuration (GitHub Actions)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
name: CI/CD
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [dev, main]
|
||||||
|
pull_request:
|
||||||
|
branches: [dev]
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
deploy_target:
|
||||||
|
description: "Deploy to target server"
|
||||||
|
required: true
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
lint:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.10"
|
||||||
|
- run: pip install black ruff
|
||||||
|
- run: black --check src/
|
||||||
|
- run: ruff check src/
|
||||||
|
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: lint
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.10"
|
||||||
|
- run: pip install -r requirements-test.txt
|
||||||
|
- run: pytest tests/ -v --tb=short --ignore=tests/gpu
|
||||||
|
|
||||||
|
security:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.10"
|
||||||
|
- run: pip install pip-audit
|
||||||
|
- run: pip-audit -r requirements.txt || true
|
||||||
|
- run: pip-audit -r src/annotation-queue/requirements.txt
|
||||||
|
- uses: returntocorp/semgrep-action@v1
|
||||||
|
with:
|
||||||
|
config: p/python
|
||||||
|
|
||||||
|
build-and-push:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: [test, security]
|
||||||
|
if: github.ref == 'refs/heads/dev' && github.event_name == 'push'
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: docker/setup-buildx-action@v3
|
||||||
|
- uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: ${{ secrets.DOCKER_REGISTRY }}
|
||||||
|
username: ${{ secrets.DOCKER_REGISTRY_USER }}
|
||||||
|
password: ${{ secrets.DOCKER_REGISTRY_TOKEN }}
|
||||||
|
- uses: docker/build-push-action@v6
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: docker/training.Dockerfile
|
||||||
|
push: true
|
||||||
|
tags: ${{ secrets.DOCKER_REGISTRY }}/azaion/training:${{ github.sha }}
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
- uses: docker/build-push-action@v6
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: docker/annotation-queue.Dockerfile
|
||||||
|
push: true
|
||||||
|
tags: ${{ secrets.DOCKER_REGISTRY }}/azaion/annotation-queue:${{ github.sha }}
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
|
||||||
|
deploy:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: build-and-push
|
||||||
|
if: github.event.inputs.deploy_target == 'true'
|
||||||
|
environment: production
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- run: |
|
||||||
|
ssh ${{ secrets.DEPLOY_USER }}@${{ secrets.DEPLOY_HOST }} \
|
||||||
|
"cd /opt/azaion-training && \
|
||||||
|
DOCKER_IMAGE_TAG=${{ github.sha }} \
|
||||||
|
bash scripts/deploy.sh"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Caching Strategy
|
||||||
|
|
||||||
|
| Cache | Key | Restore Keys |
|
||||||
|
|-------|-----|-------------|
|
||||||
|
| pip dependencies | `requirements.txt` hash | `pip-` prefix |
|
||||||
|
| Docker layers | GitHub Actions cache (BuildKit) | `gha-` prefix |
|
||||||
|
|
||||||
|
## Parallelization
|
||||||
|
|
||||||
|
```
|
||||||
|
push event
|
||||||
|
├── lint ──► test ──┐
|
||||||
|
│ ├──► build-and-push ──► deploy (manual)
|
||||||
|
└── security ───────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Lint and security run in parallel. Test depends on lint. Build depends on both test and security passing.
|
||||||
|
|
||||||
|
## Notifications
|
||||||
|
|
||||||
|
| Event | Channel | Recipients |
|
||||||
|
|-------|---------|-----------|
|
||||||
|
| Build failure | GitHub PR check | PR author |
|
||||||
|
| Security alert | GitHub security tab | Repository maintainers |
|
||||||
|
| Deploy success | GitHub Actions log | Deployment team |
|
||||||
|
| Deploy failure | GitHub Actions log + email | Deployment team |
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
# Azaion AI Training — Containerization
|
||||||
|
|
||||||
|
## Component Dockerfiles
|
||||||
|
|
||||||
|
### Training Pipeline
|
||||||
|
|
||||||
|
| Property | Value |
|
||||||
|
|----------|-------|
|
||||||
|
| Base image | `nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04` |
|
||||||
|
| Build image | Same (devel required for TensorRT engine build + pycuda) |
|
||||||
|
| Stages | 1) system deps + Python 3.10 → 2) pip install requirements → 3) copy source |
|
||||||
|
| User | `azaion` (non-root, UID 1000) |
|
||||||
|
| Health check | Not applicable — batch job, exits on completion |
|
||||||
|
| Exposed ports | None |
|
||||||
|
| Key build args | `CUDA_VERSION=12.1.1` |
|
||||||
|
|
||||||
|
Single-stage build (devel image required at runtime for TensorRT engine compilation and pycuda). The image is large but training runs for days on a dedicated GPU server — image size is not a deployment bottleneck.
|
||||||
|
|
||||||
|
Installs from `requirements.txt` with `--extra-index-url https://download.pytorch.org/whl/cu121` for PyTorch CUDA 12.1 wheels.
|
||||||
|
|
||||||
|
Volume mount: `/azaion/` host directory for datasets, models, and annotation data.
|
||||||
|
|
||||||
|
### Annotation Queue
|
||||||
|
|
||||||
|
| Property | Value |
|
||||||
|
|----------|-------|
|
||||||
|
| Base image | `python:3.10-slim` |
|
||||||
|
| Build image | `python:3.10-slim` (no compilation needed) |
|
||||||
|
| Stages | 1) pip install from `src/annotation-queue/requirements.txt` → 2) copy source |
|
||||||
|
| User | `azaion` (non-root, UID 1000) |
|
||||||
|
| Health check | `CMD python -c "import rstream" \|\| exit 1` (process liveness; no HTTP endpoint) |
|
||||||
|
| Exposed ports | None |
|
||||||
|
| Key build args | None |
|
||||||
|
|
||||||
|
Lightweight container — only needs `pyyaml`, `msgpack`, `rstream`. No GPU, no heavy ML libraries. Runs as a persistent async process consuming from RabbitMQ Streams.
|
||||||
|
|
||||||
|
Volume mount: `/azaion/` host directory for writing annotation images and labels.
|
||||||
|
|
||||||
|
### Not Containerized
|
||||||
|
|
||||||
|
The following are developer/verification tools, not production services:
|
||||||
|
|
||||||
|
- **Inference Engine** (`start_inference.py`) — used for testing and model verification, runs ad-hoc on a GPU machine
|
||||||
|
- **Data Tools** (`convert-annotations.py`, `dataset-visualiser.py`) — interactive developer utilities requiring GUI environment
|
||||||
|
|
||||||
|
## Docker Compose — Local Development
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
rabbitmq:
|
||||||
|
image: rabbitmq:3.13-management-alpine
|
||||||
|
ports:
|
||||||
|
- "5552:5552"
|
||||||
|
- "5672:5672"
|
||||||
|
- "15672:15672"
|
||||||
|
environment:
|
||||||
|
RABBITMQ_DEFAULT_USER: ${RABBITMQ_USER}
|
||||||
|
RABBITMQ_DEFAULT_PASS: ${RABBITMQ_PASSWORD}
|
||||||
|
volumes:
|
||||||
|
- rabbitmq_data:/var/lib/rabbitmq
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "rabbitmq-diagnostics", "check_running"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
annotation-queue:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: docker/annotation-queue.Dockerfile
|
||||||
|
env_file: .env
|
||||||
|
depends_on:
|
||||||
|
rabbitmq:
|
||||||
|
condition: service_healthy
|
||||||
|
volumes:
|
||||||
|
- ${AZAION_ROOT_DIR:-/azaion}:/azaion
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
training:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: docker/training.Dockerfile
|
||||||
|
env_file: .env
|
||||||
|
volumes:
|
||||||
|
- ${AZAION_ROOT_DIR:-/azaion}:/azaion
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: 1
|
||||||
|
capabilities: [gpu]
|
||||||
|
ipc: host
|
||||||
|
shm_size: "16g"
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
rabbitmq_data:
|
||||||
|
|
||||||
|
networks:
|
||||||
|
default:
|
||||||
|
name: azaion-training
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- `ipc: host` and `shm_size: "16g"` for PyTorch multi-worker data loading
|
||||||
|
- `annotation-queue` runs continuously, restarts on failure
|
||||||
|
- RabbitMQ Streams plugin must be enabled (port 5552); the management UI is on port 15672
|
||||||
|
|
||||||
|
## Docker Compose — Blackbox Tests
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
rabbitmq:
|
||||||
|
image: rabbitmq:3.13-management-alpine
|
||||||
|
ports:
|
||||||
|
- "5552:5552"
|
||||||
|
- "5672:5672"
|
||||||
|
environment:
|
||||||
|
RABBITMQ_DEFAULT_USER: test_user
|
||||||
|
RABBITMQ_DEFAULT_PASS: test_pass
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "rabbitmq-diagnostics", "check_running"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 10
|
||||||
|
|
||||||
|
annotation-queue:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: docker/annotation-queue.Dockerfile
|
||||||
|
environment:
|
||||||
|
RABBITMQ_HOST: rabbitmq
|
||||||
|
RABBITMQ_PORT: "5552"
|
||||||
|
RABBITMQ_USER: test_user
|
||||||
|
RABBITMQ_PASSWORD: test_pass
|
||||||
|
RABBITMQ_QUEUE_NAME: azaion-annotations
|
||||||
|
AZAION_ROOT_DIR: /azaion
|
||||||
|
depends_on:
|
||||||
|
rabbitmq:
|
||||||
|
condition: service_healthy
|
||||||
|
volumes:
|
||||||
|
- test_data:/azaion
|
||||||
|
|
||||||
|
test-runner:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: docker/test-runner.Dockerfile
|
||||||
|
environment:
|
||||||
|
RABBITMQ_HOST: rabbitmq
|
||||||
|
RABBITMQ_PORT: "5552"
|
||||||
|
RABBITMQ_USER: test_user
|
||||||
|
RABBITMQ_PASSWORD: test_pass
|
||||||
|
AZAION_ROOT_DIR: /azaion
|
||||||
|
TEST_SCOPE: blackbox
|
||||||
|
depends_on:
|
||||||
|
rabbitmq:
|
||||||
|
condition: service_healthy
|
||||||
|
annotation-queue:
|
||||||
|
condition: service_started
|
||||||
|
volumes:
|
||||||
|
- test_data:/azaion
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
test_data:
|
||||||
|
```
|
||||||
|
|
||||||
|
Run: `docker compose -f docker-compose.test.yml up --abort-on-container-exit`
|
||||||
|
|
||||||
|
Note: GPU-dependent tests (training) require `--gpus all` and are excluded from the default blackbox test suite. They run separately via `docker compose -f docker-compose.test.yml --profile gpu-tests up --abort-on-container-exit`.
|
||||||
|
|
||||||
|
## Image Tagging Strategy
|
||||||
|
|
||||||
|
| Context | Tag Format | Example |
|
||||||
|
|---------|-----------|---------|
|
||||||
|
| CI build | `<registry>/azaion/<component>:<git-sha>` | `registry.example.com/azaion/training:a1b2c3d` |
|
||||||
|
| Release | `<registry>/azaion/<component>:<semver>` | `registry.example.com/azaion/training:1.0.0` |
|
||||||
|
| Local dev | `azaion-<component>:latest` | `azaion-training:latest` |
|
||||||
|
|
||||||
|
## .dockerignore
|
||||||
|
|
||||||
|
```
|
||||||
|
.git
|
||||||
|
.cursor
|
||||||
|
_docs
|
||||||
|
_standalone
|
||||||
|
tests
|
||||||
|
**/__pycache__
|
||||||
|
**/*.pyc
|
||||||
|
*.md
|
||||||
|
.env
|
||||||
|
.env.example
|
||||||
|
docker-compose*.yml
|
||||||
|
.gitignore
|
||||||
|
.editorconfig
|
||||||
|
requirements-test.txt
|
||||||
|
```
|
||||||
@@ -0,0 +1,115 @@
|
|||||||
|
# Azaion AI Training — Deployment Scripts
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
| Script | Purpose | Location |
|
||||||
|
|--------|---------|----------|
|
||||||
|
| `deploy.sh` | Main deployment orchestrator | `scripts/deploy.sh` |
|
||||||
|
| `generate-config.sh` | Generate `config.yaml` from environment variables | `scripts/generate-config.sh` |
|
||||||
|
| `pull-images.sh` | Pull Docker images from registry | `scripts/pull-images.sh` |
|
||||||
|
| `start-services.sh` | Start all services via Docker Compose | `scripts/start-services.sh` |
|
||||||
|
| `stop-services.sh` | Graceful shutdown with tag backup | `scripts/stop-services.sh` |
|
||||||
|
| `health-check.sh` | Verify deployment health | `scripts/health-check.sh` |
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Docker and Docker Compose installed on target machine
|
||||||
|
- NVIDIA driver + Docker GPU support (`nvidia-container-toolkit`)
|
||||||
|
- SSH access to target machine (for remote deployment)
|
||||||
|
- `.env` file with required environment variables (see `.env.example`)
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
All scripts source `.env` from the project root.
|
||||||
|
|
||||||
|
| Variable | Required By | Purpose |
|
||||||
|
|----------|------------|---------|
|
||||||
|
| `DEPLOY_HOST` | `deploy.sh` (remote) | SSH target for remote deployment |
|
||||||
|
| `DEPLOY_USER` | `deploy.sh` (remote) | SSH user (default: `deploy`) |
|
||||||
|
| `DOCKER_REGISTRY` | `pull-images.sh` | Container registry URL |
|
||||||
|
| `DOCKER_IMAGE_TAG` | `pull-images.sh` | Image version to deploy (default: `latest`) |
|
||||||
|
| `AZAION_API_URL` | `generate-config.sh` | Azaion REST API URL |
|
||||||
|
| `AZAION_API_EMAIL` | `generate-config.sh` | API login email |
|
||||||
|
| `AZAION_API_PASSWORD` | `generate-config.sh` | API login password |
|
||||||
|
| `RABBITMQ_HOST` | `generate-config.sh` | RabbitMQ host |
|
||||||
|
| `RABBITMQ_PORT` | `generate-config.sh` | RabbitMQ port |
|
||||||
|
| `RABBITMQ_USER` | `generate-config.sh` | RabbitMQ username |
|
||||||
|
| `RABBITMQ_PASSWORD` | `generate-config.sh` | RabbitMQ password |
|
||||||
|
| `RABBITMQ_QUEUE_NAME` | `generate-config.sh` | RabbitMQ queue name |
|
||||||
|
| `AZAION_ROOT_DIR` | `start-services.sh`, `health-check.sh` | Root data directory (default: `/azaion`) |
|
||||||
|
|
||||||
|
## Script Details
|
||||||
|
|
||||||
|
### deploy.sh
|
||||||
|
|
||||||
|
Main orchestrator: generates config, pulls images, stops old services, starts new ones, checks health.
|
||||||
|
|
||||||
|
```
|
||||||
|
./scripts/deploy.sh # Deploy latest version (local)
|
||||||
|
./scripts/deploy.sh --rollback # Rollback to previous version
|
||||||
|
./scripts/deploy.sh --local # Force local mode (skip SSH)
|
||||||
|
./scripts/deploy.sh --help # Show usage
|
||||||
|
```
|
||||||
|
|
||||||
|
Flow: `generate-config.sh` → `pull-images.sh` → `stop-services.sh` → `start-services.sh` → `health-check.sh`
|
||||||
|
|
||||||
|
When `DEPLOY_HOST` is set, commands execute over SSH on the remote server. Without it, runs locally.
|
||||||
|
|
||||||
|
### generate-config.sh
|
||||||
|
|
||||||
|
Generates `config.yaml` from environment variables, preserving the existing config format the codebase expects. Validates that all required variables are set before writing.
|
||||||
|
|
||||||
|
```
|
||||||
|
./scripts/generate-config.sh # Generate config.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
### pull-images.sh
|
||||||
|
|
||||||
|
Pulls Docker images for both deployable components from the configured registry.
|
||||||
|
|
||||||
|
```
|
||||||
|
./scripts/pull-images.sh # Pull images
|
||||||
|
```
|
||||||
|
|
||||||
|
Images pulled:
|
||||||
|
- `${DOCKER_REGISTRY}/azaion/training:${DOCKER_IMAGE_TAG}`
|
||||||
|
- `${DOCKER_REGISTRY}/azaion/annotation-queue:${DOCKER_IMAGE_TAG}`
|
||||||
|
|
||||||
|
### start-services.sh
|
||||||
|
|
||||||
|
Creates the `/azaion/` directory tree if needed, then runs `docker compose up -d`.
|
||||||
|
|
||||||
|
```
|
||||||
|
./scripts/start-services.sh # Start services
|
||||||
|
```
|
||||||
|
|
||||||
|
### stop-services.sh
|
||||||
|
|
||||||
|
Saves current image tags to `scripts/.previous-tags` for rollback, then stops and removes containers with a 30-second grace period.
|
||||||
|
|
||||||
|
```
|
||||||
|
./scripts/stop-services.sh # Stop services
|
||||||
|
```
|
||||||
|
|
||||||
|
### health-check.sh
|
||||||
|
|
||||||
|
Checks container status, GPU availability, disk usage, and queue offset. Returns exit code 0 (healthy) or 1 (unhealthy).
|
||||||
|
|
||||||
|
```
|
||||||
|
./scripts/health-check.sh # Run health check
|
||||||
|
```
|
||||||
|
|
||||||
|
Checks performed:
|
||||||
|
- Annotation queue and RabbitMQ containers running
|
||||||
|
- GPU available and temperature < 90°C
|
||||||
|
- Disk usage < 95% (warning at 80%)
|
||||||
|
- Queue offset file exists
|
||||||
|
|
||||||
|
## Common Properties
|
||||||
|
|
||||||
|
All scripts:
|
||||||
|
- Use `#!/bin/bash` with `set -euo pipefail`
|
||||||
|
- Support `--help` flag
|
||||||
|
- Source `.env` from project root if present
|
||||||
|
- Are idempotent
|
||||||
|
- Support remote execution via SSH (`DEPLOY_HOST` + `DEPLOY_USER`)
|
||||||
@@ -0,0 +1,115 @@
|
|||||||
|
# Azaion AI Training — Deployment Procedures
|
||||||
|
|
||||||
|
## Deployment Strategy
|
||||||
|
|
||||||
|
**Pattern**: Stop-and-replace on a single GPU server
|
||||||
|
**Rationale**: The system runs on one dedicated GPU server. Training takes days — there is no "zero-downtime" concern for the training process. The annotation queue can tolerate brief restarts (queue offset is persisted, messages are replayed from last offset).
|
||||||
|
|
||||||
|
### Component Behavior During Deploy
|
||||||
|
|
||||||
|
| Component | Deploy Impact | Recovery |
|
||||||
|
|-----------|--------------|----------|
|
||||||
|
| Training Pipeline | Must finish current run or be stopped manually. Never interrupted mid-training — checkpoints save every epoch. | Resume from last checkpoint (`resume_training`) |
|
||||||
|
| Annotation Queue | Brief restart (< 30 seconds). Messages accumulate in RabbitMQ during downtime. | Resumes from persisted offset in `offset.yaml` |
|
||||||
|
|
||||||
|
### Graceful Shutdown
|
||||||
|
|
||||||
|
- **Training**: not stopped by deploy scripts — training runs for days and is managed independently. Deploy only updates images/code for the *next* training run.
|
||||||
|
- **Annotation Queue**: `docker stop` with 30-second grace period → SIGTERM → process exits → container replaced with new image.
|
||||||
|
|
||||||
|
## Health Checks
|
||||||
|
|
||||||
|
No HTTP endpoints — these are batch processes and queue consumers. Health is verified by:
|
||||||
|
|
||||||
|
| Check | Method | Target | Interval | Failure Action |
|
||||||
|
|-------|--------|--------|----------|----------------|
|
||||||
|
| Annotation Queue alive | `docker inspect --format='{{.State.Running}}'` | annotation-queue container | 5 min (cron) | Restart container |
|
||||||
|
| RabbitMQ reachable | TCP connect to `$RABBITMQ_HOST:$RABBITMQ_PORT` | RabbitMQ server | 5 min (cron) | Alert, check network |
|
||||||
|
| GPU available | `nvidia-smi` exit code | NVIDIA driver | 5 min (cron) | Alert, check driver |
|
||||||
|
| Disk space | `df /azaion/ --output=pcent` | Filesystem | 5 min (cron) | Alert if > 80%, critical if > 95% |
|
||||||
|
| Queue offset advancing | Compare `offset.yaml` value to previous check | Annotation queue progress | 30 min | Alert if stale and queue has messages |
|
||||||
|
|
||||||
|
All checks are performed by `scripts/health-check.sh`.
|
||||||
|
|
||||||
|
## Production Deployment
|
||||||
|
|
||||||
|
### Pre-Deploy Checklist
|
||||||
|
|
||||||
|
- [ ] All CI tests pass on `dev` branch
|
||||||
|
- [ ] Security scan clean (zero critical/high CVEs)
|
||||||
|
- [ ] Docker images built and pushed to registry
|
||||||
|
- [ ] `.env` on target server is up to date with any new variables
|
||||||
|
- [ ] `/azaion/` directory tree exists with correct permissions
|
||||||
|
- [ ] No training run is currently active (or training will not be restarted this deploy)
|
||||||
|
- [ ] NVIDIA driver and Docker with GPU support are installed on target
|
||||||
|
|
||||||
|
### Deploy Steps
|
||||||
|
|
||||||
|
1. SSH to GPU server
|
||||||
|
2. Pull new Docker images: `scripts/pull-images.sh`
|
||||||
|
3. Stop annotation queue: `scripts/stop-services.sh`
|
||||||
|
4. Generate `config.yaml` from `.env` template
|
||||||
|
5. Start services: `scripts/start-services.sh`
|
||||||
|
6. Verify health: `scripts/health-check.sh`
|
||||||
|
7. Confirm annotation queue is consuming messages (check offset advancing)
|
||||||
|
|
||||||
|
All steps are orchestrated by `scripts/deploy.sh`.
|
||||||
|
|
||||||
|
### Post-Deploy Verification
|
||||||
|
|
||||||
|
- Check `docker ps` — annotation-queue container is running
|
||||||
|
- Check `docker logs annotation-queue --tail 20` — no errors
|
||||||
|
- Check `offset.yaml` — offset is advancing (queue is consuming)
|
||||||
|
- Check disk space — adequate for continued operation
|
||||||
|
|
||||||
|
## Rollback Procedures
|
||||||
|
|
||||||
|
### Trigger Criteria
|
||||||
|
|
||||||
|
- Annotation queue crashes repeatedly after deploy (restart loop)
|
||||||
|
- Queue messages are being dropped or corrupted
|
||||||
|
- `config.yaml` generation failed (missing env vars)
|
||||||
|
- New code has a bug affecting annotation processing
|
||||||
|
|
||||||
|
### Rollback Steps
|
||||||
|
|
||||||
|
1. Run `scripts/deploy.sh --rollback`
|
||||||
|
- This reads the previous image tags from `scripts/.previous-tags` (saved during deploy)
|
||||||
|
- Stops current containers
|
||||||
|
- Starts containers with previous image tags
|
||||||
|
2. Verify health: `scripts/health-check.sh`
|
||||||
|
3. Check annotation queue is consuming correctly
|
||||||
|
4. Investigate root cause of the failed deploy
|
||||||
|
|
||||||
|
### Training Rollback
|
||||||
|
|
||||||
|
Training is not managed by deploy scripts. If a new training run produces bad results:
|
||||||
|
1. The previous `best.pt` model is still available in `/azaion/models/` (dated directories)
|
||||||
|
2. Roll back by pointing `config.yaml` to the previous model
|
||||||
|
3. No container restart needed — training is a batch job started manually
|
||||||
|
|
||||||
|
## Deployment Checklist (Quick Reference)
|
||||||
|
|
||||||
|
```
|
||||||
|
Pre-deploy:
|
||||||
|
□ CI green on dev branch
|
||||||
|
□ Images built and pushed
|
||||||
|
□ .env updated on server (if new vars added)
|
||||||
|
□ No active training run (if training container is being updated)
|
||||||
|
|
||||||
|
Deploy:
|
||||||
|
□ SSH to server
|
||||||
|
□ Run scripts/deploy.sh
|
||||||
|
□ Verify health-check.sh passes
|
||||||
|
|
||||||
|
Post-deploy:
|
||||||
|
□ docker ps shows containers running
|
||||||
|
□ docker logs show no errors
|
||||||
|
□ Queue offset advancing
|
||||||
|
□ Disk space adequate
|
||||||
|
|
||||||
|
If problems:
|
||||||
|
□ Run scripts/deploy.sh --rollback
|
||||||
|
□ Verify health
|
||||||
|
□ Investigate logs
|
||||||
|
```
|
||||||
@@ -0,0 +1,106 @@
|
|||||||
|
# Azaion AI Training — Environment Strategy
|
||||||
|
|
||||||
|
## Environments
|
||||||
|
|
||||||
|
| Environment | Purpose | Infrastructure | Data Source |
|
||||||
|
|-------------|---------|---------------|-------------|
|
||||||
|
| Development | Local developer workflow | docker-compose, local RabbitMQ | Test annotations, small sample dataset |
|
||||||
|
| Production | Live training on GPU server | Direct host processes or Docker, real RabbitMQ | Real annotations from Azaion platform |
|
||||||
|
|
||||||
|
No staging environment — the system is an ML training pipeline on a dedicated GPU server, not a multi-tier web service. Validation happens through the CI test suite (CPU tests) and manual verification on the GPU server before committing to a long training run.
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
### Required Variables
|
||||||
|
|
||||||
|
| Variable | Purpose | Dev Default | Prod Source |
|
||||||
|
|----------|---------|-------------|-------------|
|
||||||
|
| `AZAION_API_URL` | Azaion REST API base URL | `https://api.azaion.com` | `.env` on server |
|
||||||
|
| `AZAION_API_EMAIL` | API login email | dev account | `.env` on server |
|
||||||
|
| `AZAION_API_PASSWORD` | API login password | dev password | `.env` on server |
|
||||||
|
| `RABBITMQ_HOST` | RabbitMQ host | `127.0.0.1` (local container) | `.env` on server |
|
||||||
|
| `RABBITMQ_PORT` | RabbitMQ Streams port | `5552` | `.env` on server |
|
||||||
|
| `RABBITMQ_USER` | RabbitMQ username | `azaion_receiver` | `.env` on server |
|
||||||
|
| `RABBITMQ_PASSWORD` | RabbitMQ password | `changeme` | `.env` on server |
|
||||||
|
| `RABBITMQ_QUEUE_NAME` | Queue name | `azaion-annotations` | `.env` on server |
|
||||||
|
| `AZAION_ROOT_DIR` | Root data directory | `/azaion` | `.env` on server |
|
||||||
|
| `AZAION_DATA_DIR` | Validated annotations dir name | `data` | `.env` on server |
|
||||||
|
| `AZAION_DATA_SEED_DIR` | Unvalidated annotations dir name | `data-seed` | `.env` on server |
|
||||||
|
| `AZAION_DATA_DELETED_DIR` | Deleted annotations dir name | `data_deleted` | `.env` on server |
|
||||||
|
| `TRAINING_MODEL` | Base model filename | `yolo26m.pt` | `.env` on server |
|
||||||
|
| `TRAINING_EPOCHS` | Training epochs | `120` | `.env` on server |
|
||||||
|
| `TRAINING_BATCH_SIZE` | Training batch size | `11` | `.env` on server |
|
||||||
|
| `TRAINING_IMGSZ` | Training image size | `1280` | `.env` on server |
|
||||||
|
| `TRAINING_SAVE_PERIOD` | Checkpoint save interval | `1` | `.env` on server |
|
||||||
|
| `TRAINING_WORKERS` | Dataloader workers | `24` | `.env` on server |
|
||||||
|
| `EXPORT_ONNX_IMGSZ` | ONNX export image size | `1280` | `.env` on server |
|
||||||
|
|
||||||
|
### `.env.example`
|
||||||
|
|
||||||
|
Committed to version control with placeholder values. See `.env.example` in project root (created in Step 1).
|
||||||
|
|
||||||
|
### Variable Validation
|
||||||
|
|
||||||
|
The `config.yaml` generation script (part of deploy scripts) validates that all required environment variables are set before writing the config file. Missing variables cause an immediate failure with a clear error listing which variables are absent.
|
||||||
|
|
||||||
|
## Config Generation
|
||||||
|
|
||||||
|
The codebase reads configuration from `config.yaml`, not directly from environment variables. The deployment flow generates `config.yaml` from environment variables at deploy time:
|
||||||
|
|
||||||
|
1. `.env` contains all variable values (never committed)
|
||||||
|
2. Deploy script sources `.env` and renders `config.yaml` from a template
|
||||||
|
3. `config.yaml` is placed at the expected location for the application
|
||||||
|
|
||||||
|
This preserves the existing code's config reading pattern while externalizing secrets to environment variables.
|
||||||
|
|
||||||
|
## Secrets Management
|
||||||
|
|
||||||
|
| Environment | Method | Location |
|
||||||
|
|-------------|--------|----------|
|
||||||
|
| Development | `.env` file (git-ignored) | Project root |
|
||||||
|
| Production | `.env` file (restricted permissions) | GPU server `/opt/azaion-training/.env` |
|
||||||
|
|
||||||
|
Production `.env` file:
|
||||||
|
- Ownership: `root:deploy` (deploy user's group)
|
||||||
|
- Permissions: `640` (owner read/write, group read, others none)
|
||||||
|
- Located outside the Docker build context
|
||||||
|
|
||||||
|
Secrets in this project:
|
||||||
|
- `AZAION_API_PASSWORD` — API authentication
|
||||||
|
- `RABBITMQ_PASSWORD` — message queue access
|
||||||
|
- CDN credentials — auto-provisioned via API at runtime (encrypted `cdn.yaml`), not in `.env`
|
||||||
|
- Model encryption key — hardcoded in `security.py` (existing pattern, flagged as security concern)
|
||||||
|
|
||||||
|
Rotation policy: rotate API and RabbitMQ passwords quarterly. Update `.env` on the server, restart affected services.
|
||||||
|
|
||||||
|
## Filesystem Management
|
||||||
|
|
||||||
|
| Environment | `/azaion/` Location | Contents |
|
||||||
|
|-------------|-------------------|----------|
|
||||||
|
| Development | Docker volume or local dir | Test images, small sample labels |
|
||||||
|
| Production | Host directory `/azaion/` | Full annotation dataset, trained models, export artifacts |
|
||||||
|
|
||||||
|
The `/azaion/` directory tree must exist before services start:
|
||||||
|
|
||||||
|
```
|
||||||
|
/azaion/
|
||||||
|
├── data/ (validated annotations: images/ + labels/)
|
||||||
|
├── data-seed/ (unvalidated annotations: images/ + labels/)
|
||||||
|
├── data_deleted/ (soft-deleted annotations: images/ + labels/)
|
||||||
|
├── datasets/ (formed training datasets: azaion-YYYY-MM-DD/)
|
||||||
|
├── models/ (trained models: azaion-YYYY-MM-DD/, azaion.pt)
|
||||||
|
└── classes.json (annotation class definitions)
|
||||||
|
```
|
||||||
|
|
||||||
|
Production data is persistent and never deleted by deployment. Docker containers mount this directory as a bind mount.
|
||||||
|
|
||||||
|
## External Service Configuration
|
||||||
|
|
||||||
|
| Service | Dev | Prod |
|
||||||
|
|---------|-----|------|
|
||||||
|
| Azaion REST API | Real API (dev credentials) | Real API (prod credentials) |
|
||||||
|
| S3-compatible CDN | Auto-provisioned via API | Auto-provisioned via API |
|
||||||
|
| RabbitMQ | Local container (docker-compose) | Managed instance on network |
|
||||||
|
| NVIDIA GPU | Host GPU via `--gpus all` | Host GPU via `--gpus all` |
|
||||||
|
|
||||||
|
CDN credentials are not in `.env` — they are fetched from the API at runtime as an encrypted `cdn.yaml` file, decrypted using the hardware-bound key. This is the existing pattern and does not need environment variable configuration.
|
||||||
@@ -0,0 +1,135 @@
|
|||||||
|
# Azaion AI Training — Observability
|
||||||
|
|
||||||
|
This system is an ML training pipeline, not a web service. Observability focuses on training progress, GPU health, queue throughput, and disk usage rather than HTTP request metrics.
|
||||||
|
|
||||||
|
## Logging
|
||||||
|
|
||||||
|
### Format
|
||||||
|
|
||||||
|
Structured JSON to stdout/stderr. Containers should not write log files — use Docker's log driver for collection.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-03-28T14:30:00Z",
|
||||||
|
"level": "INFO",
|
||||||
|
"service": "training",
|
||||||
|
"message": "Epoch 45/120 completed",
|
||||||
|
"context": {"epoch": 45, "loss": 0.0234, "mAP50": 0.891}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Log Levels
|
||||||
|
|
||||||
|
| Level | Usage | Example |
|
||||||
|
|-------|-------|---------|
|
||||||
|
| ERROR | Exceptions, unrecoverable failures | GPU out of memory, API auth failed, corrupt label file |
|
||||||
|
| WARN | Recoverable issues | Queue reconnection attempt, skipped corrupt image |
|
||||||
|
| INFO | Progress and business events | Epoch completed, dataset formed, model exported, annotation saved |
|
||||||
|
| DEBUG | Diagnostics (dev only) | Individual file processing, queue message contents |
|
||||||
|
|
||||||
|
### Current State
|
||||||
|
|
||||||
|
| Component | Current Logging | Target |
|
||||||
|
|-----------|----------------|--------|
|
||||||
|
| Training Pipeline | `print()` statements | Python `logging` with JSON formatter to stdout |
|
||||||
|
| Annotation Queue | `logging` with TimedRotatingFileHandler | Keep existing + add JSON stdout for Docker |
|
||||||
|
| Inference Engine | `print()` statements | Not in deployment scope |
|
||||||
|
|
||||||
|
### Retention
|
||||||
|
|
||||||
|
| Environment | Destination | Retention |
|
||||||
|
|-------------|-------------|-----------|
|
||||||
|
| Development | Console (docker logs) | Session |
|
||||||
|
| Production | Docker JSON log driver → host filesystem | 30 days (log rotation via Docker daemon config) |
|
||||||
|
|
||||||
|
### PII Rules
|
||||||
|
|
||||||
|
- Never log API passwords or tokens
|
||||||
|
- Never log CDN credentials
|
||||||
|
- Never log model encryption keys
|
||||||
|
- Queue message image data (base64 bytes) must not be logged at INFO level
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
### Collection Method
|
||||||
|
|
||||||
|
No HTTP `/metrics` endpoint — these are batch processes, not services. Metrics are collected via:
|
||||||
|
1. **Docker stats** — CPU, memory, GPU via `nvidia-smi`
|
||||||
|
2. **Training logs** — parsed from structured log output (epoch, loss, mAP)
|
||||||
|
3. **Filesystem monitoring** — disk usage of `/azaion/` directory tree
|
||||||
|
|
||||||
|
### Key Metrics
|
||||||
|
|
||||||
|
| Metric | Type | Source | Description |
|
||||||
|
|--------|------|--------|-------------|
|
||||||
|
| `training_epoch` | Gauge | Training logs | Current epoch number |
|
||||||
|
| `training_loss` | Gauge | Training logs | Current training loss |
|
||||||
|
| `training_mAP50` | Gauge | Training logs | Mean average precision at IoU 0.50 |
|
||||||
|
| `training_mAP50_95` | Gauge | Training logs | mAP at IoU 0.50:0.95 |
|
||||||
|
| `gpu_utilization_pct` | Gauge | `nvidia-smi` | GPU compute utilization |
|
||||||
|
| `gpu_memory_used_mb` | Gauge | `nvidia-smi` | GPU memory usage |
|
||||||
|
| `gpu_temperature_c` | Gauge | `nvidia-smi` | GPU temperature |
|
||||||
|
| `disk_usage_azaion_gb` | Gauge | `df` / `du` | Total disk usage of `/azaion/` |
|
||||||
|
| `disk_usage_datasets_gb` | Gauge | `du` | Disk usage of `/azaion/datasets/` |
|
||||||
|
| `disk_usage_models_gb` | Gauge | `du` | Disk usage of `/azaion/models/` |
|
||||||
|
| `queue_messages_processed` | Counter | Queue logs | Total annotations processed |
|
||||||
|
| `queue_messages_failed` | Counter | Queue logs | Failed message processing |
|
||||||
|
| `queue_offset` | Gauge | `offset.yaml` | Last processed queue offset |
|
||||||
|
|
||||||
|
### Monitoring Script
|
||||||
|
|
||||||
|
A `scripts/health-check.sh` script (created in Step 7) collects these metrics on demand:
|
||||||
|
- Checks Docker container status
|
||||||
|
- Reads `nvidia-smi` for GPU metrics
|
||||||
|
- Checks disk usage
|
||||||
|
- Reads annotation queue offset
|
||||||
|
- Reports overall system health
|
||||||
|
|
||||||
|
Collection interval: on-demand via health check script, or via cron job (every 5 minutes) for continuous monitoring.
|
||||||
|
|
||||||
|
## Distributed Tracing
|
||||||
|
|
||||||
|
Not applicable. The system consists of independent batch processes (training, annotation queue) that do not form request chains. No distributed tracing is needed.
|
||||||
|
|
||||||
|
## Alerting
|
||||||
|
|
||||||
|
| Severity | Condition | Response Time | Action |
|
||||||
|
|----------|-----------|---------------|--------|
|
||||||
|
| Critical | GPU temperature > 90°C | Immediate | Pause training, investigate cooling |
|
||||||
|
| Critical | Annotation queue process crashed | 5 min | Restart container, check logs |
|
||||||
|
| Critical | Disk usage > 95% | 5 min | Free space (old datasets/models), expand storage |
|
||||||
|
| High | Training loss NaN or diverging | 30 min | Check dataset, review hyperparameters |
|
||||||
|
| High | GPU memory OOM | 30 min | Reduce batch size, restart training |
|
||||||
|
| Medium | Disk usage > 80% | 4 hours | Plan cleanup of old datasets |
|
||||||
|
| Medium | Queue offset stale for > 1 hour | 4 hours | Check RabbitMQ connectivity |
|
||||||
|
| Low | Training checkpoint save failed | Next business day | Check disk space, retry |
|
||||||
|
|
||||||
|
### Notification Method
|
||||||
|
|
||||||
|
For a single GPU server deployment, alerts are practical via:
|
||||||
|
- **Cron-based health check** running `scripts/health-check.sh` every 5 minutes
|
||||||
|
- Critical/High alerts: write to a status file, optionally send email or webhook notification
|
||||||
|
- Dashboard: a simple status page generated from the last health check output
|
||||||
|
|
||||||
|
## Dashboards
|
||||||
|
|
||||||
|
### Operations View
|
||||||
|
|
||||||
|
For a single-server deployment, a lightweight monitoring approach:
|
||||||
|
|
||||||
|
1. **GPU dashboard**: `nvidia-smi dmon` or `nvitop` running in a tmux session
|
||||||
|
2. **Training progress**: tail structured logs for epoch/loss/mAP progression
|
||||||
|
3. **Disk usage**: periodic `du -sh /azaion/*/` output
|
||||||
|
4. **Container status**: `docker ps` + `docker stats`
|
||||||
|
|
||||||
|
### Training Progress View
|
||||||
|
|
||||||
|
Key information to track during a training run:
|
||||||
|
- Current epoch / total epochs
|
||||||
|
- Training loss trend (decreasing = good)
|
||||||
|
- Validation mAP50 and mAP50-95 (increasing = good)
|
||||||
|
- GPU utilization and temperature
|
||||||
|
- Estimated time remaining
|
||||||
|
- Last checkpoint saved
|
||||||
|
|
||||||
|
YOLO's built-in TensorBoard integration provides this out of the box. Access via `tensorboard --logdir /azaion/models/azaion-YYYY-MM-DD/` on the training server.
|
||||||
@@ -8,8 +8,10 @@
|
|||||||
|-----------|--------|-------------|---------|-------------------|
|
|-----------|--------|-------------|---------|-------------------|
|
||||||
| Training Pipeline | Implemented & Tested | `train.py` | Long-running (days) | GPU server, RTX 4090 (24GB VRAM) |
|
| Training Pipeline | Implemented & Tested | `train.py` | Long-running (days) | GPU server, RTX 4090 (24GB VRAM) |
|
||||||
| Annotation Queue | Implemented & Tested | `annotation-queue/annotation_queue_handler.py` | Continuous (async) | Any server with network access |
|
| Annotation Queue | Implemented & Tested | `annotation-queue/annotation_queue_handler.py` | Continuous (async) | Any server with network access |
|
||||||
| Inference Engine | Implemented & Tested | `start_inference.py` | On-demand | GPU-equipped machine |
|
|
||||||
| Data Tools | Implemented | `convert-annotations.py`, `dataset-visualiser.py` | Ad-hoc | Developer machine |
|
Not deployed as production services:
|
||||||
|
- **Inference Engine** (`start_inference.py`) — verification/testing tool, runs ad-hoc on GPU machine
|
||||||
|
- **Data Tools** (`convert-annotations.py`, `dataset-visualiser.py`) — developer utilities
|
||||||
|
|
||||||
Note: Augmentation is not a separate process — it is YOLO's built-in mosaic/mixup within the training pipeline.
|
Note: Augmentation is not a separate process — it is YOLO's built-in mosaic/mixup within the training pipeline.
|
||||||
|
|
||||||
|
|||||||
@@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
## Current Step
|
## Current Step
|
||||||
flow: existing-code
|
flow: existing-code
|
||||||
step: 13
|
step: done
|
||||||
name: Deploy
|
name: Deploy
|
||||||
status: in_progress
|
status: completed
|
||||||
sub_step: 1 — Status & Env Check
|
sub_step: 7 — Deployment Scripts
|
||||||
retry_count: 0
|
retry_count: 0
|
||||||
|
|||||||
Executable
+105
@@ -0,0 +1,105 @@
|
|||||||
|
#!/bin/bash
# deploy.sh — Deployment orchestrator for Azaion AI Training.
# Generates config, then runs pull → stop → start → health-check, either
# locally or over SSH depending on DEPLOY_HOST. `--rollback` re-deploys the
# image tags saved by stop-services.sh during the previous deploy.
set -euo pipefail

# Resolve the scripts directory and the project root relative to this file,
# so the script works regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

usage() {
    cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Azaion AI Training — Deployment orchestrator.

Options:
  --rollback    Rollback to previous image tags
  --local       Run locally (skip SSH, default if DEPLOY_HOST is unset)
  --help        Show this help message

Environment:
  DEPLOY_HOST   Target server for remote deployment (optional)
  DEPLOY_USER   SSH user (default: deploy)
EOF
    exit 0
}

ROLLBACK=false
LOCAL=false

# Parse flags. Unknown options print usage and exit (usage exits 0; a
# non-zero exit for bad flags might be preferable — flagged for review).
for arg in "$@"; do
    case "$arg" in
        --rollback) ROLLBACK=true ;;
        --local) LOCAL=true ;;
        --help) usage ;;
        *) echo "Unknown option: $arg"; usage ;;
    esac
done

# Load .env if present; `set -a` exports everything sourced so child
# scripts (generate-config.sh, docker compose) see the variables.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
    set -a
    source "$PROJECT_ROOT/.env"
    set +a
fi

DEPLOY_HOST="${DEPLOY_HOST:-}"
DEPLOY_USER="${DEPLOY_USER:-deploy}"

# No DEPLOY_HOST means there is nothing to SSH to — force local mode.
if [[ -z "$DEPLOY_HOST" ]]; then
    LOCAL=true
fi

# run_cmd: execute an arbitrary shell command locally or on the remote host.
run_cmd() {
    if [[ "$LOCAL" == true ]]; then
        bash -c "$1"
    else
        ssh "${DEPLOY_USER}@${DEPLOY_HOST}" "$1"
    fi
}

# run_script: execute one of the sibling deployment scripts locally, or
# remotely from the fixed install path /opt/azaion-training.
run_script() {
    local script="$1"
    shift
    if [[ "$LOCAL" == true ]]; then
        bash "$SCRIPT_DIR/$script" "$@"
    else
        ssh "${DEPLOY_USER}@${DEPLOY_HOST}" "cd /opt/azaion-training && bash scripts/$script $*"
    fi
}

echo "=== Azaion AI Training — Deploy ==="
echo "Mode: $(if $LOCAL; then echo 'local'; else echo "remote ($DEPLOY_HOST)"; fi)"
echo "Action: $(if $ROLLBACK; then echo 'rollback'; else echo 'deploy'; fi)"
echo ""

# Render config.yaml from environment variables. NOTE(review): this always
# runs locally, even in remote mode — presumably the remote start script
# regenerates or mounts the config; confirm against the server layout.
"$SCRIPT_DIR/generate-config.sh"

if [[ "$ROLLBACK" == true ]]; then
    # .previous-tags is written by stop-services.sh during the last deploy;
    # sourcing it exports PREV_IMAGE_* variables for the restart below.
    PREV_TAGS="$SCRIPT_DIR/.previous-tags"
    if [[ ! -f "$PREV_TAGS" ]]; then
        echo "ERROR: No previous tags found at $PREV_TAGS — cannot rollback"
        exit 1
    fi
    echo "Rolling back to previous image tags..."
    set -a
    source "$PREV_TAGS"
    set +a
fi

echo "[1/4] Pulling images..."
run_script pull-images.sh

echo "[2/4] Stopping services..."
run_script stop-services.sh

echo "[3/4] Starting services..."
run_script start-services.sh

echo "[4/4] Checking health..."
# `if` disarms `set -e` for the health check so a failure reaches the
# rollback hint instead of aborting silently.
if run_script health-check.sh; then
    echo ""
    echo "=== Deploy successful ==="
else
    echo ""
    echo "=== Health check FAILED ==="
    echo "Run: $0 --rollback"
    exit 1
fi
|
||||||
Executable
+77
@@ -0,0 +1,77 @@
|
|||||||
|
#!/bin/bash
# generate-config.sh — Render config.yaml for the application from
# environment variables, optionally sourcing .env from the project root.
# Fails fast with a list of any required variables that are unset.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
CONFIG_FILE="$PROJECT_ROOT/config.yaml"

usage() {
    cat <<EOF
Usage: $(basename "$0") [--help]

Generate config.yaml from environment variables.
Sources .env from project root if present.
EOF
    exit 0
}

[[ "${1:-}" == "--help" ]] && usage

# Export everything from .env (when present) so the expansions below see it.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
    set -a
    source "$PROJECT_ROOT/.env"
    set +a
fi

# Variables with no sensible default — these must be provided explicitly.
REQUIRED_VARS=(
    AZAION_API_URL AZAION_API_EMAIL AZAION_API_PASSWORD
    RABBITMQ_HOST RABBITMQ_PORT RABBITMQ_USER RABBITMQ_PASSWORD RABBITMQ_QUEUE_NAME
    AZAION_ROOT_DIR
)

# Collect every unset/empty required variable before reporting, so the
# operator sees the full list in one run instead of fixing them one by one.
absent=()
for name in "${REQUIRED_VARS[@]}"; do
    [[ -n "${!name:-}" ]] || absent+=("$name")
done

if [[ ${#absent[@]} -gt 0 ]]; then
    echo "ERROR: Missing required environment variables:"
    printf '  %s\n' "${absent[@]}"
    echo "Set them in .env or export them before running."
    exit 1
fi

# Render the YAML. Optional settings fall back to defaults inline.
# NOTE(review): values are wrapped in single quotes in the template, so a
# password containing a single quote would break the YAML — confirm whether
# that is a realistic input.
cat > "$CONFIG_FILE" <<YAML
api:
  url: '${AZAION_API_URL}'
  email: '${AZAION_API_EMAIL}'
  password: '${AZAION_API_PASSWORD}'

queue:
  host: '${RABBITMQ_HOST}'
  port: ${RABBITMQ_PORT}
  consumer_user: '${RABBITMQ_USER}'
  consumer_pw: '${RABBITMQ_PASSWORD}'
  name: '${RABBITMQ_QUEUE_NAME}'

dirs:
  root: '${AZAION_ROOT_DIR}'
  data: '${AZAION_DATA_DIR:-data}'
  data_seed: '${AZAION_DATA_SEED_DIR:-data-seed}'
  data_deleted: '${AZAION_DATA_DELETED_DIR:-data_deleted}'

training:
  model: '${TRAINING_MODEL:-yolo26m.pt}'
  epochs: ${TRAINING_EPOCHS:-120}
  batch: ${TRAINING_BATCH_SIZE:-11}
  imgsz: ${TRAINING_IMGSZ:-1280}
  save_period: ${TRAINING_SAVE_PERIOD:-1}
  workers: ${TRAINING_WORKERS:-24}

export:
  onnx_imgsz: ${EXPORT_ONNX_IMGSZ:-1280}
YAML

echo "Generated $CONFIG_FILE"
|
||||||
Executable
+119
@@ -0,0 +1,119 @@
|
|||||||
|
#!/bin/bash
# health-check.sh — One-shot health report for the Azaion AI Training
# deployment: container states, GPU, disk usage under AZAION_ROOT_DIR, and
# the annotation-queue offset file. Exits 0 when all checks pass, 1 otherwise.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

usage() {
    cat <<EOF
Usage: $(basename "$0") [--help]

Check health of Azaion AI Training deployment.

Checks: container status, GPU availability, disk usage, queue offset.
Exit code 0 = healthy, 1 = unhealthy.
EOF
    exit 0
}

[[ "${1:-}" == "--help" ]] && usage

# Load .env (if present) so AZAION_ROOT_DIR and friends match the deploy.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
    set -a
    source "$PROJECT_ROOT/.env"
    set +a
fi

AZAION_ROOT_DIR="${AZAION_ROOT_DIR:-/azaion}"
# Flipped to false by check() on any failure; drives the final exit code.
HEALTHY=true

# check NAME RESULT — print one report line; any RESULT other than the
# literal "OK" is treated as a failure and marks the run unhealthy.
check() {
    local name="$1"
    local result="$2"
    if [[ "$result" == "OK" ]]; then
        printf " %-30s %s\n" "$name" "[OK]"
    else
        printf " %-30s %s\n" "$name" "[FAIL] $result"
        HEALTHY=false
    fi
}

echo "=== Azaion AI Training — Health Check ==="
echo ""

# --- Containers: both compose services must exist and be running. -------
echo "Containers:"
for svc in annotation-queue rabbitmq; do
    # `|| true` keeps `set -e` from aborting when the service is unknown.
    cid=$(docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps -q "$svc" 2>/dev/null || true)
    if [[ -z "$cid" ]]; then
        check "$svc" "container not found"
    else
        state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null || echo "unknown")
        if [[ "$state" == "running" ]]; then
            check "$svc" "OK"
        else
            check "$svc" "state=$state"
        fi
    fi
done
echo ""

# --- GPU: informational metrics plus a hard fail above 90°C. ------------
echo "GPU:"
if command -v nvidia-smi &>/dev/null; then
    gpu_temp=$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A")
    gpu_mem=$(nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A")
    gpu_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A")

    if [[ "$gpu_temp" != "N/A" ]]; then
        check "GPU available" "OK"
        printf " %-30s %s°C\n" " Temperature" "$gpu_temp"
        printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem"
        printf " %-30s %s%%\n" " Utilization" "$gpu_util"
        # 90°C matches the Critical threshold in the observability doc.
        if [[ "$gpu_temp" -gt 90 ]]; then
            check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C"
        fi
    else
        check "GPU" "nvidia-smi failed"
    fi
else
    check "GPU (nvidia-smi)" "not installed"
fi
echo ""

# --- Disk: warn above 80%, critical above 95% (alerting thresholds). ----
echo "Disk:"
if [[ -d "$AZAION_ROOT_DIR" ]]; then
    # NOTE(review): `df --output=pcent` is GNU coreutils specific — fine on
    # the Linux GPU server, not portable to BSD/macOS.
    disk_pct=$(df "$AZAION_ROOT_DIR" --output=pcent 2>/dev/null | tail -1 | tr -d ' %' || echo "N/A")
    if [[ "$disk_pct" != "N/A" ]]; then
        if [[ "$disk_pct" -gt 95 ]]; then
            check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%"
        elif [[ "$disk_pct" -gt 80 ]]; then
            check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%"
        else
            check "Disk usage ($AZAION_ROOT_DIR)" "OK"
        fi
        printf " %-30s %s%%\n" " Usage" "$disk_pct"
    fi
    azaion_size=$(du -sh "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1 || echo "N/A")
    printf " %-30s %s\n" " Total size" "$azaion_size"
else
    check "Data directory ($AZAION_ROOT_DIR)" "does not exist"
fi
echo ""

# --- Queue: report the last processed offset from the queue's state file.
echo "Queue:"
OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml"
if [[ -f "$OFFSET_FILE" ]]; then
    # Assumes a line of the form "offset_queue: <n>" — TODO confirm format.
    offset=$(grep 'offset_queue' "$OFFSET_FILE" 2>/dev/null | awk '{print $2}' || echo "N/A")
    printf " %-30s %s\n" "Last queue offset" "$offset"
    check "Offset file" "OK"
else
    check "Offset file" "not found at $OFFSET_FILE"
fi
echo ""

echo "=== Result: $(if $HEALTHY; then echo 'HEALTHY'; else echo 'UNHEALTHY'; fi) ==="

if $HEALTHY; then
    exit 0
else
    exit 1
fi
|
||||||
Executable
+48
@@ -0,0 +1,48 @@
|
|||||||
|
#!/bin/bash
# pull-images.sh — Fetch the training and annotation-queue images from the
# configured container registry, aborting on the first failed pull.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

usage() {
    cat <<EOF
Usage: $(basename "$0") [--help]

Pull Azaion AI Training Docker images from the container registry.

Environment:
  DOCKER_REGISTRY Registry URL (required)
  DOCKER_IMAGE_TAG Image tag to pull (default: latest)
EOF
    exit 0
}

[[ "${1:-}" == "--help" ]] && usage

# Pick up registry settings from .env when available.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
    set -a
    source "$PROJECT_ROOT/.env"
    set +a
fi

# Registry is mandatory; the tag defaults to "latest".
DOCKER_REGISTRY="${DOCKER_REGISTRY:?DOCKER_REGISTRY is required}"
DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-latest}"

IMAGES=(
    "${DOCKER_REGISTRY}/azaion/training:${DOCKER_IMAGE_TAG}"
    "${DOCKER_REGISTRY}/azaion/annotation-queue:${DOCKER_IMAGE_TAG}"
)

echo "Pulling images (tag: ${DOCKER_IMAGE_TAG})..."

for img in "${IMAGES[@]}"; do
    echo "  Pulling $img ..."
    # A failed pull aborts the whole deploy — better to stop here than to
    # restart services against a half-updated image set.
    docker pull "$img" || { echo "  FAILED: $img"; exit 1; }
    echo "  OK: $img"
done

echo "All images pulled successfully."
|
||||||
Executable
+54
@@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
# start-services.sh — Ensure the /azaion data-directory tree exists, then
# start the stack with Docker Compose and print its status.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

usage() {
    cat <<EOF
Usage: $(basename "$0") [--help]

Start Azaion AI Training services via Docker Compose.

Environment:
  AZAION_ROOT_DIR Root data directory (default: /azaion)
EOF
    exit 0
}

[[ "${1:-}" == "--help" ]] && usage

# .env is optional: source it when present so directory names and compose
# variables match the deployed configuration.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
    set -a
    source "$PROJECT_ROOT/.env"
    set +a
fi

AZAION_ROOT_DIR="${AZAION_ROOT_DIR:-/azaion}"

# The full directory tree containers expect to bind-mount; mkdir -p makes
# this idempotent across repeated deploys.
dirs=(
    "$AZAION_ROOT_DIR"
    "$AZAION_ROOT_DIR/${AZAION_DATA_DIR:-data}/images"
    "$AZAION_ROOT_DIR/${AZAION_DATA_DIR:-data}/labels"
    "$AZAION_ROOT_DIR/${AZAION_DATA_SEED_DIR:-data-seed}/images"
    "$AZAION_ROOT_DIR/${AZAION_DATA_SEED_DIR:-data-seed}/labels"
    "$AZAION_ROOT_DIR/${AZAION_DATA_DELETED_DIR:-data_deleted}/images"
    "$AZAION_ROOT_DIR/${AZAION_DATA_DELETED_DIR:-data_deleted}/labels"
    "$AZAION_ROOT_DIR/datasets"
    "$AZAION_ROOT_DIR/models"
)

echo "Ensuring directory structure..."
for d in "${dirs[@]}"; do
    mkdir -p "$d"
done

echo "Starting services..."
# FIX: only pass --env-file when .env actually exists. The previous version
# passed it unconditionally, so `docker compose up` errored out on hosts
# without a .env file even though this script treats .env as optional above.
compose_args=(-f "$PROJECT_ROOT/docker-compose.yml")
if [[ -f "$PROJECT_ROOT/.env" ]]; then
    compose_args+=(--env-file "$PROJECT_ROOT/.env")
fi
docker compose "${compose_args[@]}" up -d

echo "Waiting for containers to start..."
# Fixed grace period before reporting status; the authoritative health
# verdict comes from health-check.sh afterwards.
sleep 5

docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps

echo "Services started."
|
||||||
Executable
+44
@@ -0,0 +1,44 @@
|
|||||||
|
#!/bin/bash
# stop-services.sh — Record the currently running image tags for rollback,
# then gracefully stop and remove the compose stack.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

usage() {
    cat <<EOF
Usage: $(basename "$0") [--help]

Gracefully stop Azaion AI Training services.
Saves current image tags for rollback.
EOF
    exit 0
}

[[ "${1:-}" == "--help" ]] && usage

if [[ -f "$PROJECT_ROOT/.env" ]]; then
    set -a
    source "$PROJECT_ROOT/.env"
    set +a
fi

COMPOSE_FILE="$PROJECT_ROOT/docker-compose.yml"
PREV_TAGS="$SCRIPT_DIR/.previous-tags"

echo "Saving current image tags for rollback..."
# Snapshot each running service's image as PREV_IMAGE_<service>=<image>,
# with dashes mapped to underscores so the names are valid shell variables.
{
    for service in annotation-queue; do
        container_id=$(docker compose -f "$COMPOSE_FILE" ps -q "$service" 2>/dev/null || true)
        [[ -z "$container_id" ]] && continue
        current_image=$(docker inspect --format='{{.Config.Image}}' "$container_id" 2>/dev/null || echo "unknown")
        echo "PREV_IMAGE_${service//-/_}=$current_image"
    done
} > "$PREV_TAGS"

echo "Stopping services (30s grace period)..."
docker compose -f "$COMPOSE_FILE" stop -t 30

echo "Removing containers..."
docker compose -f "$COMPOSE_FILE" down --remove-orphans

echo "Services stopped. Previous tags saved to $PREV_TAGS"
|
||||||
@@ -11,6 +11,7 @@ _TEST_ROOT = _TESTS_DIR / "root"
|
|||||||
_DATASET_IMAGES = _TEST_ROOT / "data" / "images"
|
_DATASET_IMAGES = _TEST_ROOT / "data" / "images"
|
||||||
_DATASET_LABELS = _TEST_ROOT / "data" / "labels"
|
_DATASET_LABELS = _TEST_ROOT / "data" / "labels"
|
||||||
_ONNX_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion.onnx"
|
_ONNX_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion.onnx"
|
||||||
|
_PT_MODEL = _PROJECT_ROOT / "_docs/00_problem/input_data/azaion-2025-03-10.pt"
|
||||||
_CLASSES_JSON = _PROJECT_ROOT / "src" / "classes.json"
|
_CLASSES_JSON = _PROJECT_ROOT / "src" / "classes.json"
|
||||||
_CONFIG_TEST = _PROJECT_ROOT / "config.test.yaml"
|
_CONFIG_TEST = _PROJECT_ROOT / "config.test.yaml"
|
||||||
_MODELS_DIR = _TEST_ROOT / "models"
|
_MODELS_DIR = _TEST_ROOT / "models"
|
||||||
@@ -88,6 +89,14 @@ def fixture_onnx_model():
|
|||||||
return p.read_bytes()
|
return p.read_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def fixture_pt_model():
    """Return the path (as ``str``) to the bundled ``.pt`` model.

    Skips the requesting test when the model file is not present on disk,
    mirroring the sibling ``fixture_onnx_model`` fixture.
    """
    model_path = _PT_MODEL
    if model_path.is_file():
        return str(model_path)
    pytest.skip(f"missing pt model: {model_path}")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def fixture_classes_json():
|
def fixture_classes_json():
|
||||||
p = _CLASSES_JSON
|
p = _CLASSES_JSON
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -180,3 +181,27 @@ class TestCoremlExport:
|
|||||||
# Assert
|
# Assert
|
||||||
assert len(results) == 1
|
assert len(results) == 1
|
||||||
assert results[0].boxes is not None
|
assert results[0].boxes is not None
|
||||||
|
|
||||||
|
|
||||||
|
_INPUT_DATA = _TESTS_DIR.parent / "_docs" / "00_problem" / "input_data"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(sys.platform != "darwin", reason="CoreML requires macOS")
class TestCoremlExportRealModel:
    """Export the real project ``.pt`` model to a CoreML package on disk.

    Unlike the synthetic-model tests above, this test has a persistent side
    effect: it (re)creates ``azaion.mlpackage`` under the input-data
    directory, replacing any previous export.
    """

    def test_export_azaion_pt_to_coreml(self, fixture_pt_model) -> None:
        # Arrange: remove a stale package so the assertion below proves a
        # fresh export, not a leftover from a previous run.
        output_dir = _INPUT_DATA / "azaion.mlpackage"
        if output_dir.exists():
            shutil.rmtree(output_dir)

        # Act: YOLO writes the export next to the source .pt file, so move
        # it into the canonical input_data location if they differ.
        model = YOLO(fixture_pt_model)
        model.export(format="coreml", imgsz=1280)
        exported = Path(fixture_pt_model).with_suffix(".mlpackage")
        if exported != output_dir:
            shutil.move(str(exported), str(output_dir))

        # Assert: the package exists and contains the compiled model file
        # at the standard CoreML package layout.
        assert output_dir.exists()
        model_file = output_dir / "Data" / "com.apple.CoreML" / "model.mlmodel"
        assert model_file.exists()
|
||||||
|
|||||||
Reference in New Issue
Block a user