Files
Oleksandr Bezdieniezhnykh aeb7f8ca8c Update autopilot workflow and documentation for project cycle completion
- Modified the existing-code workflow to automatically loop back to New Task after project completion without user confirmation.
- Updated the autopilot state to reflect the current step as `done` and status as `completed`.
- Clarified the deployment status report by specifying non-deployed services and their purposes.

These changes enhance the automation of task management and improve documentation clarity.
2026-03-29 05:02:22 +03:00

120 lines
3.5 KiB
Bash
Executable File

#!/bin/bash
# Strict mode: abort on unhandled command failures (-e), on expansion of
# unset variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Absolute path of the directory that contains this script, resolved through
# BASH_SOURCE so it does not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Absolute path of the parent of SCRIPT_DIR.
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
#######################################
# Print the help text and terminate the script successfully.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; exits the shell with status 0
#######################################
usage() {
  printf 'Usage: %s [--help]\n' "$(basename "$0")"
  printf '%s\n' \
    'Check health of Azaion AI Training deployment.' \
    'Checks: container status, GPU availability, disk usage, queue offset.' \
    'Exit code 0 = healthy, 1 = unhealthy.'
  exit 0
}
# Show the help text (and exit) when it is requested as the first argument.
if [[ "${1:-}" == "--help" ]]; then
  usage
fi

# Load optional deployment settings. While allexport is on, every variable
# assigned by the sourced file is exported to child processes.
if [[ -f "$PROJECT_ROOT/.env" ]]; then
  set -o allexport
  . "$PROJECT_ROOT/.env"
  set +o allexport
fi

# Data directory: keep a non-empty value from the environment or .env,
# otherwise fall back to /azaion.
: "${AZAION_ROOT_DIR:=/azaion}"

# Overall verdict; check() switches it to "false" on the first failure.
HEALTHY=true
#######################################
# Print one aligned report line for a single health check.
# Globals:   HEALTHY (set to "false" when the check did not pass)
# Arguments: $1 - label shown in the report
#            $2 - "OK" for a passing check, otherwise the failure detail
# Outputs:   one line on stdout: the label followed by [OK] or [FAIL] <detail>
#######################################
check() {
  local label="$1" outcome="$2"
  local verdict="[OK]"
  if [[ "$outcome" != "OK" ]]; then
    verdict="[FAIL] $outcome"
    HEALTHY=false
  fi
  printf " %-30s %s\n" "$label" "$verdict"
}
echo "=== Azaion AI Training — Health Check ==="
echo ""
echo "Containers:"
# Each listed service must have at least one container, and every container
# of the service must be in the "running" state.
for svc in annotation-queue rabbitmq; do
  # `ps -q` prints one container ID per line. A failing docker command is
  # deliberately folded into "no containers" so the report still completes.
  cids=$(docker compose -f "$PROJECT_ROOT/docker-compose.yml" ps -q "$svc" 2>/dev/null || true)
  if [[ -z "$cids" ]]; then
    check "$svc" "container not found"
    continue
  fi
  # A scaled service yields several IDs. Inspect them one at a time: passing
  # the newline-joined list as a single argument makes `docker inspect` fail
  # and would report a bogus "state=unknown" for a healthy service.
  while IFS= read -r cid; do
    [[ -n "$cid" ]] || continue
    # stdin is detached so the command can never consume the loop's input.
    state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null </dev/null || echo "unknown")
    if [[ "$state" == "running" ]]; then
      check "$svc" "OK"
    else
      check "$svc" "state=$state"
    fi
  done <<<"$cids"
done
echo ""
echo "GPU:"
if command -v nvidia-smi &>/dev/null; then
  # All fields come from a single invocation, so they describe the same
  # instant and the tool is started once. The output is captured whole and
  # discarded on a non-zero exit; it is not piped through `head`, because
  # under pipefail a failing producer (or one killed by SIGPIPE) would make a
  # trailing `|| echo "N/A"` append to partial output instead of replacing it.
  gpu_raw=$(nvidia-smi --query-gpu=temperature.gpu,memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits 2>/dev/null) || gpu_raw=""
  # Only the first reported GPU is shown.
  gpu_line=${gpu_raw%%$'\n'*}
  if [[ -n "$gpu_line" ]]; then
    IFS=',' read -r gpu_temp gpu_mem_used gpu_mem_total gpu_util <<<"$gpu_line" || true
    # CSV fields are separated by ", "; strip the leading space of each field.
    gpu_temp=${gpu_temp# }
    gpu_mem_used=${gpu_mem_used# }
    gpu_mem_total=${gpu_mem_total# }
    gpu_util=${gpu_util# }
    check "GPU available" "OK"
    printf " %-30s %s°C\n" " Temperature" "$gpu_temp"
    printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem_used, $gpu_mem_total"
    printf " %-30s %s%%\n" " Utilization" "$gpu_util"
    # Compare only a plain integer: a placeholder such as "[N/A]" would be an
    # arithmetic error (or an unbound-variable abort under `set -u`).
    # 10# forces base 10 so a leading zero is not read as octal.
    if [[ "$gpu_temp" =~ ^[0-9]+$ ]] && (( 10#$gpu_temp > 90 )); then
      check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C"
    fi
  else
    check "GPU" "nvidia-smi failed"
  fi
else
  check "GPU (nvidia-smi)" "not installed"
fi
echo ""
echo "Disk:"
if [[ -d "$AZAION_ROOT_DIR" ]]; then
  # Options go first and `--` ends option parsing, so a directory name that
  # starts with "-" cannot be mistaken for an option.
  disk_pct=$(df --output=pcent -- "$AZAION_ROOT_DIR" 2>/dev/null | tail -n 1 | tr -d ' %') || disk_pct=""
  # Only a plain integer may reach the arithmetic tests; anything else
  # (df failed, or the filesystem reports "-") is an explicit failure rather
  # than a silently skipped check. 10# forces base 10.
  if [[ "$disk_pct" =~ ^[0-9]+$ ]]; then
    if (( 10#$disk_pct > 95 )); then
      check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%"
    elif (( 10#$disk_pct > 80 )); then
      check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%"
    else
      check "Disk usage ($AZAION_ROOT_DIR)" "OK"
    fi
    printf " %-30s %s%%\n" " Usage" "$disk_pct"
  else
    check "Disk usage ($AZAION_ROOT_DIR)" "could not determine usage"
  fi
  # du exits non-zero when it cannot read some subdirectory but still prints
  # a total; keep that total and fall back to "N/A" only when nothing at all
  # was printed. The size is informational and never affects the verdict.
  azaion_size=$(du -sh -- "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1) || true
  [[ -n "$azaion_size" ]] || azaion_size="N/A"
  printf " %-30s %s\n" " Total size" "$azaion_size"
else
  check "Data directory ($AZAION_ROOT_DIR)" "does not exist"
fi
echo ""
echo "Queue:"
# File in which the queue offset is looked up.
OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml"
if [[ ! -f "$OFFSET_FILE" ]]; then
  check "Offset file" "not found at $OFFSET_FILE"
else
  # Second whitespace-separated field of every line mentioning offset_queue;
  # "N/A" is emitted when the pipeline fails (e.g. no matching line).
  offset=$(grep 'offset_queue' "$OFFSET_FILE" 2>/dev/null | awk '{print $2}' || echo "N/A")
  printf " %-30s %s\n" "Last queue offset" "$offset"
  # The check covers the file's presence only, not the offset value.
  check "Offset file" "OK"
fi
# Translate the accumulated HEALTHY flag into the summary line and the
# process exit status (0 = healthy, 1 = unhealthy).
verdict='UNHEALTHY'
status=1
if $HEALTHY; then
  verdict='HEALTHY'
  status=0
fi
echo ""
echo "=== Result: $verdict ==="
exit "$status"