mirror of
https://github.com/azaion/ai-training.git
synced 2026-04-23 00:56:35 +00:00
Update autopilot workflow and documentation for project cycle completion
- Modified the existing-code workflow to loop back to New Task automatically after project completion, without user confirmation.
- Updated the autopilot state to reflect the current step as `done` and the status as `completed`.
- Clarified the deployment status report by specifying the non-deployed services and their purposes.

These changes enhance the automation of task management and improve documentation clarity.
This commit is contained in:
Executable
+119
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Absolute path of the directory holding this script, and of its parent
# directory (the project root).
# `cd` prints the target directory when it is resolved through CDPATH, which
# would be captured in addition to the output of `pwd`; its stdout is therefore
# discarded. `--` protects against paths that begin with a dash.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
PROJECT_ROOT="$(cd -- "$SCRIPT_DIR/.." >/dev/null && pwd)"

#######################################
# Print the help text and terminate the script.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   does not return; exits with status 0
#######################################
usage() {
  # Unquoted delimiter on purpose: the script name is expanded into the text.
  cat <<EOF
Usage: ${0##*/} [--help]

Check health of Azaion AI Training deployment.

Checks: container status, GPU availability, disk usage, queue offset.
Exit code 0 = healthy, 1 = unhealthy.
EOF
  exit 0
}

# Show the help text (which exits) when --help is the first argument.
# Any other arguments are ignored.
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load optional settings from the project's .env file. `set -a` marks every
# variable assigned while it is active for export; `set +a` turns that off.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck source=/dev/null
  source "$PROJECT_ROOT/.env"
  set +a
fi

# Data directory to inspect; an existing (non-empty) value from the environment
# or from .env wins over the default.
AZAION_ROOT_DIR="${AZAION_ROOT_DIR:-/azaion}"

# Overall verdict. Starts as "true"; check() sets it to "false" on any failure.
HEALTHY=true

#######################################
# Print one aligned status line and record a failure in the overall verdict.
# Globals:   HEALTHY (set to "false" when the result is not "OK")
# Arguments: $1 - label printed in the left column
#            $2 - the literal string "OK" for a passing check; any other
#                 value is treated as the failure detail
# Outputs:   the formatted status line on stdout
#######################################
check() {
  local name="$1"
  local result="$2"

  if [[ "$result" == "OK" ]]; then
    printf " %-30s %s\n" "$name" "[OK]"
  else
    printf " %-30s %s\n" "$name" "[FAIL] $result"
    HEALTHY=false
  fi
}

echo "=== Azaion AI Training — Health Check ==="
echo ""

# --- Containers -------------------------------------------------------------
# Each compose service must have at least one container, and every container
# of the service must be in the "running" state.
echo "Containers:"
for svc in annotation-queue rabbitmq; do
  # Best effort: any docker/compose error yields an empty list and is reported
  # below as "container not found".
  container_ids=$(docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps -q "$svc" 2>/dev/null || true)
  if [[ -z "$container_ids" ]]; then
    check "$svc" "container not found"
    continue
  fi

  # `ps -q` prints one ID per line, so a scaled service yields several IDs.
  # Inspect them one by one instead of passing the whole multi-line string to
  # `docker inspect` as a single (invalid) container name.
  svc_result="OK"
  while IFS= read -r cid; do
    [[ -n "$cid" ]] || continue
    state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null || echo "unknown")
    if [[ "$state" != "running" ]]; then
      svc_result="state=$state"
      break
    fi
  done <<<"$container_ids"
  check "$svc" "$svc_result"
done
echo ""

# --- GPU --------------------------------------------------------------------
echo "GPU:"

#######################################
# Query one nvidia-smi --query-gpu field list for the first listed GPU.
# Arguments: $1 - value for --query-gpu (e.g. "temperature.gpu")
# Outputs:   the first output line, or "N/A" if nvidia-smi fails or prints
#            nothing
#######################################
gpu_query() {
  local out
  # The exit status is checked before the output is used, so text printed by
  # a failing nvidia-smi is never mistaken for a reading.
  if out=$(nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits 2>/dev/null) \
    && [[ -n "$out" ]]; then
    printf '%s\n' "${out%%$'\n'*}"
  else
    echo "N/A"
  fi
}

if command -v nvidia-smi &>/dev/null; then
  gpu_temp=$(gpu_query temperature.gpu)
  gpu_mem=$(gpu_query memory.used,memory.total)
  gpu_util=$(gpu_query utilization.gpu)

  if [[ "$gpu_temp" != "N/A" ]]; then
    check "GPU available" "OK"
    printf " %-30s %s°C\n" " Temperature" "$gpu_temp"
    printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem"
    printf " %-30s %s%%\n" " Utilization" "$gpu_util"
    # Compare only when the reading is a plain integer; a non-numeric value
    # would otherwise raise an arithmetic error in the comparison.
    if [[ "$gpu_temp" =~ ^[0-9]+$ ]] && ((10#$gpu_temp > 90)); then
      check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C"
    fi
  else
    check "GPU" "nvidia-smi failed"
  fi
else
  check "GPU (nvidia-smi)" "not installed"
fi
echo ""

# --- Disk -------------------------------------------------------------------
echo "Disk:"

#######################################
# Map a disk usage percentage to the result string expected by check().
# Arguments: $1 - usage percentage as a non-negative integer
# Outputs:   "CRITICAL: N%" above 95, "WARNING: N%" above 80, otherwise "OK"
#######################################
disk_result() {
  local pct="$1"
  # 10# forces base 10 so a value with a leading zero is not read as octal.
  if ((10#$pct > 95)); then
    echo "CRITICAL: ${pct}%"
  elif ((10#$pct > 80)); then
    echo "WARNING: ${pct}%"
  else
    echo "OK"
  fi
}

if [[ -d "$AZAION_ROOT_DIR" ]]; then
  # `|| true` keeps a failing pipeline from aborting the script under
  # `set -e` without appending any text to the captured output.
  disk_pct=$(df --output=pcent -- "$AZAION_ROOT_DIR" 2>/dev/null | tail -n 1 | tr -d ' %' || true)
  if [[ "$disk_pct" =~ ^[0-9]+$ ]]; then
    check "Disk usage ($AZAION_ROOT_DIR)" "$(disk_result "$disk_pct")"
    printf " %-30s %s%%\n" " Usage" "$disk_pct"
  else
    # df failed or printed something that is not a number: report the value
    # as unavailable instead of feeding it to an arithmetic comparison.
    printf " %-30s %s\n" " Usage" "N/A"
  fi

  # du may exit non-zero (e.g. an unreadable subdirectory) and still print a
  # total; keep whatever it printed and fall back to N/A only when empty.
  azaion_size=$(du -sh -- "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1 || true)
  printf " %-30s %s\n" " Total size" "${azaion_size:-N/A}"
else
  check "Data directory ($AZAION_ROOT_DIR)" "does not exist"
fi
echo ""

# --- Queue ------------------------------------------------------------------
echo "Queue:"

#######################################
# Read the queue offset from an offset file.
# Arguments: $1 - path of the file to read
# Outputs:   the second whitespace-separated field of the first line that
#            contains "offset_queue", or "N/A" when there is no such line,
#            the field is empty, or the file cannot be read
#######################################
queue_offset() {
  local value
  value=$(awk '/offset_queue/ { print $2; exit }' "$1" 2>/dev/null || true)
  printf '%s\n' "${value:-N/A}"
}

OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml"
if [[ -f "$OFFSET_FILE" ]]; then
  offset=$(queue_offset "$OFFSET_FILE")
  printf " %-30s %s\n" "Last queue offset" "$offset"
  check "Offset file" "OK"
else
  check "Offset file" "not found at $OFFSET_FILE"
fi
echo ""

# --- Verdict ----------------------------------------------------------------
# HEALTHY is compared as a string rather than executed as a command, so an
# empty or unexpected value is reported as unhealthy instead of running
# whatever the variable happens to contain.
if [[ "${HEALTHY:-}" == "true" ]]; then
  echo "=== Result: HEALTHY ==="
  exit 0
else
  echo "=== Result: UNHEALTHY ==="
  exit 1
fi
Reference in New Issue
Block a user