mirror of
https://github.com/azaion/ai-training.git
synced 2026-04-22 05:16:34 +00:00
aeb7f8ca8c
- Modified the existing-code workflow to automatically loop back to New Task after project completion without user confirmation.
- Updated the autopilot state to reflect the current step as `done` and status as `completed`.
- Clarified the deployment status report by specifying non-deployed services and their purposes.

These changes enhance the automation of task management and improve documentation clarity.
120 lines
3.5 KiB
Bash
Executable File
120 lines
3.5 KiB
Bash
Executable File
#!/bin/bash
#
# Health check for the Azaion AI Training deployment.
# Run with --help for a summary of the checks and exit codes.

# Strict mode: abort on unhandled errors, unset variables and failures in
# any stage of a pipeline.
set -euo pipefail

# Absolute path of the directory holding this script, and of its parent,
# which is treated as the project root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

#######################################
# Print the help text and terminate the script successfully.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   does not return; exits with status 0
#######################################
usage() {
  cat <<EOF
Usage: $(basename "$0") [--help]

Check health of Azaion AI Training deployment.

Checks: container status, GPU availability, disk usage, queue offset.
Exit code 0 = healthy, 1 = unhealthy.
EOF
  exit 0
}

# Show the help text when asked for it (usage exits the script).
case "${1:-}" in
  -h | --help) usage ;;
esac

# Load optional project settings. `set -a` exports every variable the file
# assigns; `set +a` restores the default afterwards.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -a
  # shellcheck source=/dev/null
  source "$PROJECT_ROOT/.env"
  set +a
fi

# Data directory to inspect; /azaion unless already set (e.g. by .env).
AZAION_ROOT_DIR="${AZAION_ROOT_DIR:-/azaion}"

# Overall verdict; check() flips it to "false" on the first failing check.
HEALTHY=true

#######################################
# Report the outcome of a single health check.
# Globals:   HEALTHY (set to "false" when the check did not pass)
# Arguments: $1 - label shown for the check
#            $2 - "OK" for a passing check, otherwise the failure detail
# Outputs:   one aligned status line on stdout
#######################################
check() {
  local name="$1"
  local result="$2"

  if [[ "$result" == "OK" ]]; then
    printf " %-30s %s\n" "$name" "[OK]"
  else
    printf " %-30s %s\n" "$name" "[FAIL] $result"
    HEALTHY=false
  fi
}

echo "=== Azaion AI Training — Health Check ==="
echo ""

# --- Containers: every listed compose service must have running containers.
echo "Containers:"
for svc in annotation-queue rabbitmq; do
  # Best effort: a failing `docker compose` is reported as "container not
  # found" instead of aborting the script under `set -e`.
  cids=$(docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps -q "$svc" 2>/dev/null || true)
  if [[ -z "$cids" ]]; then
    check "$svc" "container not found"
    continue
  fi

  # A service may have several containers (one ID per line); inspect each
  # one separately rather than passing the whole list as a single argument.
  while IFS= read -r cid; do
    [[ -n "$cid" ]] || continue
    # Assign the fallback instead of echoing it, so partial stdout from a
    # failed inspect can never be concatenated with "unknown".
    state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null) || state="unknown"
    if [[ "$state" == "running" ]]; then
      check "$svc" "OK"
    else
      check "$svc" "state=$state"
    fi
  done <<<"$cids"
done
echo ""

#######################################
# Query nvidia-smi for a comma-separated list of --query-gpu fields.
# Arguments: $1 - field list, e.g. "temperature.gpu"
# Outputs:   first line of the result on stdout, or "N/A" when nvidia-smi
#            exits non-zero or prints nothing
# Returns:   always 0
#######################################
gpu_query() {
  local out
  if out=$(nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits 2>/dev/null) \
    && [[ -n "$out" ]]; then
    # Keep only the first line (first GPU) without spawning `head`.
    printf '%s\n' "${out%%$'\n'*}"
  else
    echo "N/A"
  fi
}

# --- GPU: nvidia-smi must be installed and answer queries.
echo "GPU:"
if command -v nvidia-smi &>/dev/null; then
  gpu_temp=$(gpu_query temperature.gpu)
  gpu_mem=$(gpu_query memory.used,memory.total)
  gpu_util=$(gpu_query utilization.gpu)

  if [[ "$gpu_temp" != "N/A" ]]; then
    check "GPU available" "OK"
    printf " %-30s %s°C\n" " Temperature" "$gpu_temp"
    printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem"
    printf " %-30s %s%%\n" " Utilization" "$gpu_util"
    # Compare only when the reading is a plain integer; anything else would
    # be an arithmetic error. 10# forces base 10 for values like "08".
    if [[ "$gpu_temp" =~ ^[0-9]+$ ]] && ((10#$gpu_temp > 90)); then
      check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C"
    fi
  else
    check "GPU" "nvidia-smi failed"
  fi
else
  check "GPU (nvidia-smi)" "not installed"
fi
echo ""

# --- Disk: usage of the filesystem holding the data directory.
# Above 95% is reported as CRITICAL, above 80% as WARNING; both go through
# check() as failures.
echo "Disk:"
if [[ -d "$AZAION_ROOT_DIR" ]]; then
  # `--output=pcent` prints a header plus e.g. " 42%"; keep the last line
  # and strip blanks and the percent sign.
  disk_pct=$(df --output=pcent -- "$AZAION_ROOT_DIR" 2>/dev/null | tail -n 1 | tr -d ' %') || disk_pct="N/A"

  # Compare only when the value is a plain integer; df can yield "-" or
  # nothing, which would be an arithmetic error.
  if [[ "$disk_pct" =~ ^[0-9]+$ ]]; then
    if ((disk_pct > 95)); then
      check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%"
    elif ((disk_pct > 80)); then
      check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%"
    else
      check "Disk usage ($AZAION_ROOT_DIR)" "OK"
    fi
    printf " %-30s %s%%\n" " Usage" "$disk_pct"
  else
    # Informational only: an unreadable percentage does not fail the run.
    printf " %-30s %s\n" " Usage" "N/A"
  fi

  # du can exit non-zero (e.g. unreadable subdirectories) after printing a
  # total; keep whatever it printed and use N/A only when there is nothing.
  azaion_size=$(du -sh -- "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1) || true
  [[ -n "$azaion_size" ]] || azaion_size="N/A"
  printf " %-30s %s\n" " Total size" "$azaion_size"
else
  check "Data directory ($AZAION_ROOT_DIR)" "does not exist"
fi
echo ""

# --- Queue: the annotation-queue offset file must exist.
echo "Queue:"
OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml"
if [[ -f "$OFFSET_FILE" ]]; then
  # Second whitespace-separated field of the first line mentioning
  # offset_queue; N/A when there is no such line or the field is empty.
  offset=$(awk '/offset_queue/ { print $2; exit }' "$OFFSET_FILE" 2>/dev/null) || true
  [[ -n "$offset" ]] || offset="N/A"
  printf " %-30s %s\n" "Last queue offset" "$offset"
  check "Offset file" "OK"
else
  check "Offset file" "not found at $OFFSET_FILE"
fi
echo ""

# --- Verdict: exit 0 when every check passed, 1 otherwise.
# HEALTHY is compared as a string rather than run as a command, so an empty
# or unexpected value counts as unhealthy.
if [[ "$HEALTHY" == "true" ]]; then
  echo "=== Result: HEALTHY ==="
  exit 0
fi

echo "=== Result: UNHEALTHY ==="
exit 1