#!/bin/bash set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" usage() { cat </dev/null || true) if [[ -z "$cid" ]]; then check "$svc" "container not found" else state=$(docker inspect --format='{{.State.Status}}' "$cid" 2>/dev/null || echo "unknown") if [[ "$state" == "running" ]]; then check "$svc" "OK" else check "$svc" "state=$state" fi fi done echo "" echo "GPU:" if command -v nvidia-smi &>/dev/null; then gpu_temp=$(nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") gpu_mem=$(nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") gpu_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "N/A") if [[ "$gpu_temp" != "N/A" ]]; then check "GPU available" "OK" printf " %-30s %s°C\n" " Temperature" "$gpu_temp" printf " %-30s %s MiB\n" " Memory (used/total)" "$gpu_mem" printf " %-30s %s%%\n" " Utilization" "$gpu_util" if [[ "$gpu_temp" -gt 90 ]]; then check "GPU temperature" "CRITICAL: ${gpu_temp}°C > 90°C" fi else check "GPU" "nvidia-smi failed" fi else check "GPU (nvidia-smi)" "not installed" fi echo "" echo "Disk:" if [[ -d "$AZAION_ROOT_DIR" ]]; then disk_pct=$(df "$AZAION_ROOT_DIR" --output=pcent 2>/dev/null | tail -1 | tr -d ' %' || echo "N/A") if [[ "$disk_pct" != "N/A" ]]; then if [[ "$disk_pct" -gt 95 ]]; then check "Disk usage ($AZAION_ROOT_DIR)" "CRITICAL: ${disk_pct}%" elif [[ "$disk_pct" -gt 80 ]]; then check "Disk usage ($AZAION_ROOT_DIR)" "WARNING: ${disk_pct}%" else check "Disk usage ($AZAION_ROOT_DIR)" "OK" fi printf " %-30s %s%%\n" " Usage" "$disk_pct" fi azaion_size=$(du -sh "$AZAION_ROOT_DIR" 2>/dev/null | cut -f1 || echo "N/A") printf " %-30s %s\n" " Total size" "$azaion_size" else check "Data directory ($AZAION_ROOT_DIR)" "does not exist" fi echo "" echo "Queue:" 
# --- Queue offset and final verdict ------------------------------------------
# Prints the last queue offset recorded in the annotation-queue offset file,
# records the outcome through check(), then prints the overall result and
# exits 0 (healthy) or 1 (unhealthy).
# Reads:  PROJECT_ROOT, HEALTHY, and the check() helper (defined outside this section).
# Writes: OFFSET_FILE, offset.
OFFSET_FILE="$PROJECT_ROOT/src/annotation-queue/offset.yaml"

if [[ -f "$OFFSET_FILE" ]]; then
  # Line-oriented extraction: take the second whitespace-separated field of
  # every line containing 'offset_queue'. The `|| echo` fallback covers the
  # case where the pipeline itself fails (e.g. no matching line).
  offset=$(grep 'offset_queue' "$OFFSET_FILE" 2>/dev/null | awk '{print $2}' || echo "N/A")

  # A matching line without a value makes the pipeline succeed with empty
  # output, so the fallback above does not fire; report N/A instead of a blank.
  [[ -n "$offset" ]] || offset="N/A"

  printf " %-30s %s\n" "Last queue offset" "$offset"
  check "Offset file" "OK"
else
  check "Offset file" "not found at $OFFSET_FILE"
fi

echo ""

# HEALTHY is executed as a command here, so it is presumably `true` or `false`
# (its definition is outside this section — TODO confirm).
if $HEALTHY; then
  echo "=== Result: HEALTHY ==="
  exit 0
else
  echo "=== Result: UNHEALTHY ==="
  exit 1
fi