Files
Velocity-OS/infrastructure/model-hydration/install_gpu_sglang_watchdog.sh

86 lines
2.5 KiB
Bash

#!/usr/bin/env bash
# Installs the Desineuron SGLang watchdog: a health-check script, a model
# hydration helper, and the systemd service/timer pair that runs the watchdog.
# Uses sudo to write under /usr/local/bin and /etc/systemd/system.
set -euo pipefail

# Watchdog script. The heredoc delimiter is quoted, so the body is written
# verbatim: every ${...} below is expanded when the watchdog runs, not here.
sudo tee /usr/local/bin/desineuron-sglang-watchdog.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
# Desineuron SGLang watchdog.
#
# Reads its configuration from /etc/default/desineuron-sglang:
#   SGLANG_PORT                     (required) port probed on 127.0.0.1
#   SGLANG_MODEL_PATH               (required) local model directory
#   SGLANG_MODEL_ID                 (required only when hydration is needed)
#   SGLANG_STARTUP_GRACE_SECONDS    (optional, default 900)
#   SGLANG_HEALTH_TIMEOUT_SECONDS   (optional, default 60)
#
# Prints "startup_grace" and exits 0 while the service process is younger than
# the grace period, prints "healthy" and exits 0 when the health endpoint
# answers, and exits non-zero when the endpoint still fails after a restart.
set -euo pipefail

# shellcheck source=/dev/null
source /etc/default/desineuron-sglang

# Fail with a readable message instead of a bare "unbound variable" error.
: "${SGLANG_PORT:?SGLANG_PORT must be set in /etc/default/desineuron-sglang}"
: "${SGLANG_MODEL_PATH:?SGLANG_MODEL_PATH must be set in /etc/default/desineuron-sglang}"

SERVICE_UNIT="desineuron-sglang.service"
HEALTH_URL="http://127.0.0.1:${SGLANG_PORT}/v1/models"
HYDRATE_HELPER="/usr/local/bin/desineuron-sglang-hydrate.sh"
STARTUP_GRACE_SECONDS="${SGLANG_STARTUP_GRACE_SECONDS:-900}"
HEALTH_TIMEOUT_SECONDS="${SGLANG_HEALTH_TIMEOUT_SECONDS:-60}"

# Prints the age in seconds of the process with the given PID.
# Prefers the elapsed time reported by ps; if ps cannot provide a number,
# falls back to the /proc/<pid> directory mtime, which is only an
# approximation of the start time (and yields 0 when stat fails).
process_age_seconds() {
  local pid="$1"
  local age
  age="$(ps -o etimes= -p "${pid}" 2>/dev/null || true)"
  age="${age//[[:space:]]/}"
  if [[ ! "${age}" =~ ^[0-9]+$ ]]; then
    age="$(( $(date +%s) - $(stat -c %Y "/proc/${pid}" 2>/dev/null || date +%s) ))"
  fi
  printf '%s\n' "${age}"
}

# Returns 0 when the health endpoint answers successfully within the timeout.
probe_health() {
  curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null
}

# Download the model when its directory is missing.
# NOTE(review): only the directory's existence is checked, so a directory left
# behind by an interrupted download is treated as already hydrated.
if [[ ! -d "${SGLANG_MODEL_PATH}" ]]; then
  : "${SGLANG_MODEL_ID:?SGLANG_MODEL_ID must be set to hydrate ${SGLANG_MODEL_PATH}}"
  "${HYDRATE_HELPER}" "${SGLANG_MODEL_ID}" "${SGLANG_MODEL_PATH}"
fi

# Bring the service up if systemd does not report it as active.
if ! systemctl is-active --quiet "${SERVICE_UNIT}"; then
  systemctl restart "${SERVICE_UNIT}"
  sleep 10
fi

# Skip the health probe while the main process is still within its startup
# grace period, so a slow model load is not mistaken for a hang.
main_pid="$(systemctl show -p MainPID --value "${SERVICE_UNIT}" || true)"
if [[ -n "${main_pid}" && "${main_pid}" != "0" ]]; then
  runtime_age="$(process_age_seconds "${main_pid}")"
  if (( runtime_age < STARTUP_GRACE_SECONDS )); then
    echo "startup_grace"
    exit 0
  fi
fi

# One restart attempt on a failed probe.
if ! probe_health; then
  systemctl restart "${SERVICE_UNIT}"
  sleep 20
fi

# Final probe: under `set -e` a failure here ends the script with curl's
# non-zero status, which marks this watchdog run as failed.
probe_health
echo "healthy"
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-watchdog.sh
# Hydration helper script. The heredoc delimiter is quoted, so the body is
# written verbatim and expanded only when the helper itself runs.
sudo tee /usr/local/bin/desineuron-sglang-hydrate.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
# Downloads a Hugging Face model repository into a local directory.
#
# Usage: desineuron-sglang-hydrate.sh <model-id> <target-path>
#   $1  model id passed to the downloader (required)
#   $2  directory the files are downloaded into (required)
#
# Uses the `hf` CLI when it is on PATH, otherwise falls back to the
# huggingface_hub Python package. Exits non-zero if the download fails.
set -euo pipefail
MODEL_ID="${1:?model id required}"
TARGET_PATH="${2:?target path required}"

# Make sure the parent directory exists; the downloader receives the target
# directory itself via --local-dir / local_dir.
mkdir -p "$(dirname "${TARGET_PATH}")"

if command -v hf >/dev/null 2>&1; then
  hf download "${MODEL_ID}" --local-dir "${TARGET_PATH}" --max-workers 8
else
  # Pass the values as arguments rather than splicing them into the Python
  # source, so quotes or backslashes in them cannot alter the program. The
  # quoted 'PY' delimiter keeps the shell from expanding anything in the body.
  python3 - "${MODEL_ID}" "${TARGET_PATH}" <<'PY'
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1], sys.argv[2]
snapshot_download(repo_id=repo_id, local_dir=local_dir, max_workers=8)
PY
fi
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-hydrate.sh
# systemd service unit: a one-shot run of the watchdog script.
# The delimiter is quoted so the unit text is written verbatim.
sudo tee /etc/systemd/system/desineuron-sglang-watchdog.service >/dev/null <<'EOF'
[Unit]
Description=Desineuron SGLang Runtime Watchdog
# After= only orders this unit relative to network-online.target; Wants= is
# what actually pulls the target into the transaction.
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/desineuron-sglang-watchdog.sh
EOF
# systemd timer unit: first run 2 minutes after boot, then 5 minutes after
# each activation of the watchdog service. Each printf argument becomes one
# line of the unit file.
printf '%s\n' \
  '[Unit]' \
  'Description=Run the Desineuron SGLang watchdog every 5 minutes' \
  '[Timer]' \
  'OnBootSec=2min' \
  'OnUnitActiveSec=5min' \
  'Unit=desineuron-sglang-watchdog.service' \
  '[Install]' \
  'WantedBy=timers.target' \
  | sudo tee /etc/systemd/system/desineuron-sglang-watchdog.timer >/dev/null
# Activate the installed units: reload systemd's unit definitions, enable and
# start the timer, run the watchdog once right away, then show the timer state.
readonly watchdog_timer="desineuron-sglang-watchdog.timer"
readonly watchdog_service="desineuron-sglang-watchdog.service"

sudo systemctl daemon-reload
sudo systemctl enable --now "${watchdog_timer}"
sudo systemctl start "${watchdog_service}"
sudo systemctl --no-pager --full status "${watchdog_timer}"