mirror of
https://github.com/azaion/ai-training.git
synced 2026-04-22 06:46:35 +00:00
Update autopilot workflow and documentation for project cycle completion
- Modified the existing-code workflow to automatically loop back to New Task after project completion without user confirmation. - Updated the autopilot state to reflect the current step as `done` and status as `completed`. - Clarified the deployment status report by specifying non-deployed services and their purposes. These changes enhance the automation of task management and improve documentation clarity.
This commit is contained in:
Executable
+105
@@ -0,0 +1,105 @@
|
||||
#!/bin/bash
# Azaion AI Training — deployment orchestrator.
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of the directory holding this script, and of its parent
# directory, which the rest of the script treats as the project root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
#######################################
# Print the command-line help and terminate the script.
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Outputs:   help text on stdout
# Returns:   never returns; always exits
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [OPTIONS]

Azaion AI Training — Deployment orchestrator.

Options:
  --rollback    Rollback to previous image tags
  --local       Run locally (skip SSH, default if DEPLOY_HOST is unset)
  --help        Show this help message

Environment:
  DEPLOY_HOST   Target server for remote deployment (optional)
  DEPLOY_USER   SSH user (default: deploy)
EOF
  exit "${1:-0}"
}
|
||||
|
||||
# Mode flags; both default to off and are switched on by command-line options.
ROLLBACK=false
LOCAL=false

for arg in "$@"; do
  case "$arg" in
    --rollback) ROLLBACK=true ;;
    --local) LOCAL=true ;;
    --help) usage ;;
    *)
      # An unrecognised option is an error: report it on stderr and exit
      # non-zero. usage() terminates the shell it runs in, so it is called
      # in a subshell to let this branch choose the final exit status.
      echo "Unknown option: $arg" >&2
      (usage) >&2
      exit 1
      ;;
  esac
done

# Load project settings, if present. `set -a` exports every variable the
# file assigns so that child processes see them too.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env"
  set +a
fi

DEPLOY_HOST="${DEPLOY_HOST:-}"
DEPLOY_USER="${DEPLOY_USER:-deploy}"

# Without a target host there is nothing to SSH into: force local mode.
if [[ -z "$DEPLOY_HOST" ]]; then
  LOCAL=true
fi
|
||||
|
||||
#######################################
# Run a shell command string locally or on the deployment host.
# Globals:   LOCAL, DEPLOY_USER, DEPLOY_HOST (read)
# Arguments: $1 - command line, interpreted by a shell on the target side
# Returns:   exit status of the command (or of ssh)
# The string is handed to a shell verbatim, so callers must never pass
# untrusted input.
#######################################
run_cmd() {
  if [[ "$LOCAL" == true ]]; then
    bash -c "$1"
  else
    ssh "${DEPLOY_USER}@${DEPLOY_HOST}" "$1"
  fi
}
|
||||
|
||||
#######################################
# Run one of the sibling deployment scripts, locally or over SSH.
# Globals:   LOCAL, SCRIPT_DIR, DEPLOY_USER, DEPLOY_HOST (read)
# Arguments: $1   - script file name (e.g. pull-images.sh)
#            $2.. - arguments forwarded to the script
# Returns:   exit status of the script (or of ssh)
#######################################
run_script() {
  local script="$1"
  shift
  if [[ "$LOCAL" == true ]]; then
    bash "$SCRIPT_DIR/$script" "$@"
  else
    # ssh joins its arguments into one string that the remote shell parses
    # again, so each word is shell-quoted here to survive that second parse
    # (spaces and metacharacters in arguments would otherwise be re-split).
    local remote_cmd
    printf -v remote_cmd '%q ' bash "scripts/$script" "$@"
    ssh "${DEPLOY_USER}@${DEPLOY_HOST}" "cd /opt/azaion-training && ${remote_cmd% }"
  fi
}
|
||||
|
||||
# ---- Main deployment flow ------------------------------------------------

if [[ "$LOCAL" == true ]]; then
  mode_label='local'
else
  mode_label="remote ($DEPLOY_HOST)"
fi

if [[ "$ROLLBACK" == true ]]; then
  action_label='rollback'
else
  action_label='deploy'
fi

echo "=== Azaion AI Training — Deploy ==="
echo "Mode: $mode_label"
echo "Action: $action_label"
echo ""

# The config generator is always executed on the machine running this script.
"$SCRIPT_DIR/generate-config.sh"

if [[ "$ROLLBACK" == true ]]; then
  PREV_TAGS="$SCRIPT_DIR/.previous-tags"
  if [[ ! -f "$PREV_TAGS" ]]; then
    echo "ERROR: No previous tags found at $PREV_TAGS — cannot rollback" >&2
    exit 1
  fi
  echo "Rolling back to previous image tags..."
  # Export whatever the tags file defines so the step scripts inherit it.
  # NOTE(review): in remote mode run_script starts the steps through ssh,
  # which does not forward these exported variables — confirm how the
  # rollback tags are meant to reach the remote host.
  set -a
  # shellcheck disable=SC1090
  source "$PREV_TAGS"
  set +a
fi

echo "[1/4] Pulling images..."
run_script pull-images.sh

echo "[2/4] Stopping services..."
run_script stop-services.sh

echo "[3/4] Starting services..."
run_script start-services.sh

echo "[4/4] Checking health..."
if run_script health-check.sh; then
  echo ""
  echo "=== Deploy successful ==="
else
  echo ""
  echo "=== Health check FAILED ==="
  echo "Run: $0 --rollback"
  exit 1
fi
|
||||
Executable
+77
@@ -0,0 +1,77 @@
|
||||
#!/bin/bash
# Generate config.yaml from environment variables.
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of this script's directory, its parent (the project root),
# and the configuration file written into the project root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
CONFIG_FILE="$PROJECT_ROOT/config.yaml"
|
||||
|
||||
#######################################
# Print the command-line help and exit with status 0.
# Outputs: help text on stdout
# Returns: never returns; always exits
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [--help]

Generate config.yaml from environment variables.
Sources .env from project root if present.
EOF
  exit 0
}
|
||||
|
||||
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load project settings, if present; `set -a` exports every assignment.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env"
  set +a
fi

# Variables that have no default and must be provided by the environment.
required_vars=(
  AZAION_API_URL AZAION_API_EMAIL AZAION_API_PASSWORD
  RABBITMQ_HOST RABBITMQ_PORT RABBITMQ_USER RABBITMQ_PASSWORD RABBITMQ_QUEUE_NAME
  AZAION_ROOT_DIR
)

# Collect every unset-or-empty variable so all of them are reported at once.
missing=()
for var in "${required_vars[@]}"; do
  if [[ -z "${!var:-}" ]]; then
    missing+=("$var")
  fi
done

if [[ ${#missing[@]} -gt 0 ]]; then
  {
    echo "ERROR: Missing required environment variables:"
    printf ' %s\n' "${missing[@]}"
    echo "Set them in .env or export them before running."
  } >&2
  exit 1
fi
|
||||
|
||||
#######################################
# Emit a value as a YAML single-quoted scalar.
# Inside single quotes YAML's only escape is a doubled quote (''), so any
# embedded ' is doubled; without this a value such as a password that
# contains a quote would produce an invalid document.
# Arguments: $1 - raw value
# Outputs:   the quoted scalar on stdout (no trailing newline)
#######################################
yaml_squote() {
  local value="$1"
  local q="'"
  printf "'%s'" "${value//$q/$q$q}"
}

# Write the configuration. Nested keys are indented two spaces under their
# section; numeric settings are emitted unquoted.
cat > "$CONFIG_FILE" <<YAML
api:
  url: $(yaml_squote "${AZAION_API_URL}")
  email: $(yaml_squote "${AZAION_API_EMAIL}")
  password: $(yaml_squote "${AZAION_API_PASSWORD}")

queue:
  host: $(yaml_squote "${RABBITMQ_HOST}")
  port: ${RABBITMQ_PORT}
  consumer_user: $(yaml_squote "${RABBITMQ_USER}")
  consumer_pw: $(yaml_squote "${RABBITMQ_PASSWORD}")
  name: $(yaml_squote "${RABBITMQ_QUEUE_NAME}")

dirs:
  root: $(yaml_squote "${AZAION_ROOT_DIR}")
  data: $(yaml_squote "${AZAION_DATA_DIR:-data}")
  data_seed: $(yaml_squote "${AZAION_DATA_SEED_DIR:-data-seed}")
  data_deleted: $(yaml_squote "${AZAION_DATA_DELETED_DIR:-data_deleted}")

training:
  model: $(yaml_squote "${TRAINING_MODEL:-yolo26m.pt}")
  epochs: ${TRAINING_EPOCHS:-120}
  batch: ${TRAINING_BATCH_SIZE:-11}
  imgsz: ${TRAINING_IMGSZ:-1280}
  save_period: ${TRAINING_SAVE_PERIOD:-1}
  workers: ${TRAINING_WORKERS:-24}

export:
  onnx_imgsz: ${EXPORT_ONNX_IMGSZ:-1280}
YAML

echo "Generated $CONFIG_FILE"
|
||||
Executable
+119
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
# Check health of Azaion AI Training deployment.
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of this script's directory and of its parent (project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
#######################################
# Print the command-line help and exit with status 0.
# Outputs: help text on stdout
# Returns: never returns; always exits
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [--help]

Check health of Azaion AI Training deployment.

Checks: container status, GPU availability, disk usage, queue offset.
Exit code 0 = healthy, 1 = unhealthy.
EOF
  exit 0
}
|
||||
|
||||
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load project settings, if present; `set -a` exports every assignment.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env"
  set +a
fi

AZAION_ROOT_DIR="${AZAION_ROOT_DIR:-/azaion}"

# Overall verdict; flipped to false by check() on the first failing probe.
HEALTHY=true
|
||||
|
||||
#######################################
# Report the outcome of one probe and record failures.
# Globals:   HEALTHY (set to false when the result is not "OK")
# Arguments: $1 - label of the probe
#            $2 - "OK" on success, otherwise a description of the failure
# Outputs:   one aligned status line on stdout
#######################################
check() {
  local name="$1"
  local result="$2"
  if [[ "$result" == "OK" ]]; then
    printf " %-30s %s\n" "$name" "[OK]"
  else
    printf " %-30s %s\n" "$name" "[FAIL] $result"
    HEALTHY=false
  fi
}
|
||||
|
||||
#######################################
# Query one nvidia-smi field list and print the first output line.
# Arguments: $1 - value for --query-gpu (e.g. temperature.gpu)
# Outputs:   first line of the query result on stdout
# Returns:   non-zero when nvidia-smi itself fails
#######################################
gpu_query() {
  local out
  out=$(nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits 2>/dev/null) || return 1
  printf '%s\n' "${out%%$'\n'*}"
}

echo "=== Azaion AI Training — Health Check ==="
echo ""

echo "Containers:"
for svc in annotation-queue rabbitmq; do
  # Lookup failures are deliberately tolerated: an empty id is reported
  # below as "container not found".
  cid=$(docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps -q "$svc" 2>/dev/null || true)
  if [[ -z "$cid" ]]; then
    check "$svc" "container not found"
    continue
  fi
  state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null || echo "unknown")
  if [[ "$state" == "running" ]]; then
    check "$svc" "OK"
  else
    check "$svc" "state=$state"
  fi
done
echo ""

echo "GPU:"
if command -v nvidia-smi &>/dev/null; then
  # Availability is decided by nvidia-smi's exit status rather than by
  # inspecting its text, so an error message printed on stdout cannot be
  # mistaken for a temperature reading.
  if gpu_temp=$(gpu_query temperature.gpu); then
    gpu_mem=$(gpu_query memory.used,memory.total || echo "N/A")
    gpu_util=$(gpu_query utilization.gpu || echo "N/A")
    check "GPU available" "OK"
    printf " %-30s %s°C\n" " Temperature" "$gpu_temp"
    printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem"
    printf " %-30s %s%%\n" " Utilization" "$gpu_util"
    # Compare only when the reading is a plain integer; a non-numeric value
    # in an arithmetic test is an error (fatal under `set -u`).
    if [[ "$gpu_temp" =~ ^[0-9]+$ ]] && (( gpu_temp > 90 )); then
      check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C"
    fi
  else
    check "GPU" "nvidia-smi failed"
  fi
else
  check "GPU (nvidia-smi)" "not installed"
fi
echo ""

echo "Disk:"
if [[ -d "$AZAION_ROOT_DIR" ]]; then
  disk_pct=$(df "$AZAION_ROOT_DIR" --output=pcent 2>/dev/null | tail -1 | tr -d ' %' || echo "N/A")
  # Evaluate thresholds only for an integer percentage; anything else
  # (df failure, "-" for pseudo file systems) skips the usage check.
  if [[ "$disk_pct" =~ ^[0-9]+$ ]]; then
    if (( disk_pct > 95 )); then
      check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%"
    elif (( disk_pct > 80 )); then
      check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%"
    else
      check "Disk usage ($AZAION_ROOT_DIR)" "OK"
    fi
    printf " %-30s %s%%\n" " Usage" "$disk_pct"
  fi
  azaion_size=$(du -sh "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1 || echo "N/A")
  printf " %-30s %s\n" " Total size" "$azaion_size"
else
  check "Data directory ($AZAION_ROOT_DIR)" "does not exist"
fi
echo ""

echo "Queue:"
OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml"
if [[ -f "$OFFSET_FILE" ]]; then
  # Second whitespace-separated field of the line mentioning offset_queue.
  offset=$(grep 'offset_queue' "$OFFSET_FILE" 2>/dev/null | awk '{print $2}' || echo "N/A")
  printf " %-30s %s\n" "Last queue offset" "$offset"
  check "Offset file" "OK"
else
  check "Offset file" "not found at $OFFSET_FILE"
fi
echo ""

# Final verdict: exit status 0 when every probe passed, 1 otherwise.
if [[ "$HEALTHY" == true ]]; then
  echo "=== Result: HEALTHY ==="
  exit 0
fi
echo "=== Result: UNHEALTHY ==="
exit 1
|
||||
Executable
+48
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
# Pull Azaion AI Training Docker images from the container registry.
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of this script's directory and of its parent (project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
#######################################
# Print the command-line help and exit with status 0.
# Outputs: help text on stdout
# Returns: never returns; always exits
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [--help]

Pull Azaion AI Training Docker images from the container registry.

Environment:
  DOCKER_REGISTRY    Registry URL (required)
  DOCKER_IMAGE_TAG   Image tag to pull (default: latest)
EOF
  exit 0
}
|
||||
|
||||
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load project settings, if present; `set -a` exports every assignment.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env"
  set +a
fi

# The registry has no sensible default: abort with a message when missing.
DOCKER_REGISTRY="${DOCKER_REGISTRY:?DOCKER_REGISTRY is required}"
DOCKER_IMAGE_TAG="${DOCKER_IMAGE_TAG:-latest}"

# Fully qualified references of the images to fetch.
IMAGES=(
  "${DOCKER_REGISTRY}/azaion/training:${DOCKER_IMAGE_TAG}"
  "${DOCKER_REGISTRY}/azaion/annotation-queue:${DOCKER_IMAGE_TAG}"
)

echo "Pulling images (tag: ${DOCKER_IMAGE_TAG})..."

# Stop at the first image that cannot be pulled.
for image in "${IMAGES[@]}"; do
  echo " Pulling $image ..."
  if docker pull "$image"; then
    echo " OK: $image"
  else
    echo " FAILED: $image" >&2
    exit 1
  fi
done

echo "All images pulled successfully."
|
||||
Executable
+54
@@ -0,0 +1,54 @@
|
||||
#!/bin/bash
# Start Azaion AI Training services via Docker Compose.
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of this script's directory and of its parent (project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
#######################################
# Print the command-line help and exit with status 0.
# Outputs: help text on stdout
# Returns: never returns; always exits
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [--help]

Start Azaion AI Training services via Docker Compose.

Environment:
  AZAION_ROOT_DIR   Root data directory (default: /azaion)
EOF
  exit 0
}
|
||||
|
||||
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load project settings, if present; `set -a` exports every assignment.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env"
  set +a
fi

AZAION_ROOT_DIR="${AZAION_ROOT_DIR:-/azaion}"

# Directory layout created under the data root before the services start.
dirs=(
  "$AZAION_ROOT_DIR"
  "$AZAION_ROOT_DIR/${AZAION_DATA_DIR:-data}/images"
  "$AZAION_ROOT_DIR/${AZAION_DATA_DIR:-data}/labels"
  "$AZAION_ROOT_DIR/${AZAION_DATA_SEED_DIR:-data-seed}/images"
  "$AZAION_ROOT_DIR/${AZAION_DATA_SEED_DIR:-data-seed}/labels"
  "$AZAION_ROOT_DIR/${AZAION_DATA_DELETED_DIR:-data_deleted}/images"
  "$AZAION_ROOT_DIR/${AZAION_DATA_DELETED_DIR:-data_deleted}/labels"
  "$AZAION_ROOT_DIR/datasets"
  "$AZAION_ROOT_DIR/models"
)

echo "Ensuring directory structure..."
for d in "${dirs[@]}"; do
  mkdir -p "$d"
done

# Compose invocation kept as an array so paths with spaces stay intact.
compose=(docker compose -f "$PROJECT_ROOT/docker-compose.yml")

# .env is optional everywhere else in this script, so only hand it to
# Compose when it exists; an unconditional --env-file fails on a missing file.
compose_up=("${compose[@]}")
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  compose_up+=(--env-file "$PROJECT_ROOT/.env")
fi

echo "Starting services..."
"${compose_up[@]}" up -d

echo "Waiting for containers to start..."
sleep 5

"${compose[@]}" ps

echo "Services started."
|
||||
Executable
+44
@@ -0,0 +1,44 @@
|
||||
#!/bin/bash
# Gracefully stop Azaion AI Training services.
#
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Absolute path of this script's directory and of its parent (project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
#######################################
# Print the command-line help and exit with status 0.
# Outputs: help text on stdout
# Returns: never returns; always exits
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [--help]

Gracefully stop Azaion AI Training services.
Saves current image tags for rollback.
EOF
  exit 0
}
|
||||
|
||||
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load project settings, if present; `set -a` exports every assignment.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env"
  set +a
fi

PREV_TAGS="$SCRIPT_DIR/.previous-tags"

echo "Saving current image tags for rollback..."
# Collect the assignments in memory first. The rollback file is rewritten
# only when at least one container was found, so a run with nothing up does
# not wipe previously recorded rollback information.
saved_tags=""
for svc in annotation-queue; do
  # Lookup failures are deliberately tolerated: no id means nothing to record.
  cid=$(docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps -q "$svc" 2>/dev/null || true)
  if [[ -n "$cid" ]]; then
    img=$(docker inspect --format='{{.Config.Image}}' "$cid" 2>/dev/null || echo "unknown")
    # Dashes are not valid in shell variable names; map them to underscores.
    saved_tags+="PREV_IMAGE_${svc//-/_}=$img"$'\n'
  fi
done

if [[ -n "$saved_tags" ]]; then
  printf '%s' "$saved_tags" > "$PREV_TAGS"
else
  echo "No service containers found; leaving $PREV_TAGS unchanged."
fi

echo "Stopping services (30s grace period)..."
docker compose -f "$PROJECT_ROOT/docker-compose.yml" stop -t 30

echo "Removing containers..."
docker compose -f "$PROJECT_ROOT/docker-compose.yml" down --remove-orphans

if [[ -n "$saved_tags" ]]; then
  echo "Services stopped. Previous tags saved to $PREV_TAGS"
else
  echo "Services stopped."
fi
|
||||
Reference in New Issue
Block a user