Files
Project_Velocity/infrastructure/desineuron_ingress/install_gpu_ollama_watchdog.sh
2026-04-23 01:20:21 +05:30

109 lines
3.1 KiB
Bash

#!/usr/bin/env bash
#
# Installs the Desineuron GPU Ollama watchdog on this host:
#   1. points Ollama's model store at the NVMe volume via a systemd drop-in,
#   2. writes a hydrate helper that pulls a model with `ollama pull`,
#   3. writes a watchdog script that re-hydrates the model when it is missing,
#   4. installs and starts a systemd service + timer that runs the watchdog.
# Must be run by a user that can `sudo`.
set -euo pipefail

readonly MODEL_NAME="qwen3.6:35b-a3b"
readonly NVME_ROOT="/opt/dlami/nvme/ollama"
readonly OLLAMA_OVERRIDE_DIR="/etc/systemd/system/ollama.service.d"

# 1. Configure Ollama to use NVME
sudo mkdir -p "${NVME_ROOT}/models" "${NVME_ROOT}/state" "${NVME_ROOT}/logs"

# The watchdog generated further down hands this tree to ollama:ollama, so do
# the same here to keep ownership consistent from the very first start.
# Fall back to root:root only when no `ollama` account exists on this host.
if id -u ollama >/dev/null 2>&1; then
  sudo chown -R ollama:ollama "${NVME_ROOT}"
else
  sudo chown -R root:root "${NVME_ROOT}"
fi

echo "Configuring Ollama to use NVME storage at ${NVME_ROOT}/models..."
sudo mkdir -p "${OLLAMA_OVERRIDE_DIR}"
# NOTE(review): OLLAMA_HOST=0.0.0.0 makes the API listen on every interface;
# confirm that a security group / firewall restricts access to port 11434.
sudo tee "${OLLAMA_OVERRIDE_DIR}/override.conf" >/dev/null <<EOF
[Service]
Environment="OLLAMA_MODELS=${NVME_ROOT}/models"
Environment="OLLAMA_HOST=0.0.0.0"
EOF

sudo systemctl daemon-reload
sudo systemctl enable ollama.service
# `enable --now` leaves an already-running service untouched, so the new
# drop-in would not take effect. `restart` starts a stopped unit and reloads
# the environment of a running one.
sudo systemctl restart ollama.service
# 2. Write the Hydrate Helper
# Generates a small script that pulls the model named by its first argument
# and appends progress lines to ${NVME_ROOT}/logs/qwen36_hydrate.log.
# The heredoc is unquoted on purpose: ${NVME_ROOT} is baked in at install time,
# while every \$-escaped expansion is evaluated when the helper runs.
HYDRATE_HELPER="/usr/local/bin/desineuron-hydrate-qwen36.sh"
echo "Creating Hydrate Helper map at $HYDRATE_HELPER"
sudo tee "$HYDRATE_HELPER" >/dev/null <<EOF
#!/usr/bin/env bash
# Usage: desineuron-hydrate-qwen36.sh <model>
# Pulls <model> with ollama and records start/finish in the hydrate log.
set -euo pipefail

# Fail with a readable message instead of an unbound-variable error.
MODEL="\${1:?usage: desineuron-hydrate-qwen36.sh <model>}"
LOG_DIR="${NVME_ROOT}/logs"
LOG_FILE="\$LOG_DIR/qwen36_hydrate.log"

# The NVMe volume may have been wiped; make sure the log directory exists.
sudo mkdir -p "\$LOG_DIR"

echo "(\$(date)) Hydrating \$MODEL model using ollama pull..." | sudo tee -a "\$LOG_FILE"
# This requires outward access or an Ollama compatible registry proxy
# Note: For S3-based private GGUFs, this would use s5cmd
if ! ollama pull "\$MODEL"; then
  # Leave a trace in the log before reporting the failure to the caller.
  echo "(\$(date)) Hydration FAILED" | sudo tee -a "\$LOG_FILE" >&2
  exit 1
fi
echo "(\$(date)) Hydration complete" | sudo tee -a "\$LOG_FILE"
EOF
sudo chmod 0755 "$HYDRATE_HELPER"
# 3. Write Watchdog Script
# Generates the watchdog that systemd runs periodically. It restarts Ollama
# when the unit is not active, re-hydrates the model when /api/tags does not
# list it, and exits 0 ("healthy") or 1 ("unhealthy") based on a final check.
# ${MODEL_NAME} and ${NVME_ROOT} are expanded now (install time); every
# \$-escaped expansion is evaluated when the watchdog runs.
WATCHDOG_SCRIPT="/usr/local/bin/desineuron-ollama-watchdog.sh"
echo "Creating Watchdog Script map at $WATCHDOG_SCRIPT"
sudo tee "$WATCHDOG_SCRIPT" >/dev/null <<EOF
#!/usr/bin/env bash
set -euo pipefail
MODEL_NAME="${MODEL_NAME}"
OLLAMA_URL="http://127.0.0.1:11434"

# Returns 0 when the tag listing mentions MODEL_NAME, non-zero otherwise
# (including when the API cannot be reached).
# The response is captured first so that an early grep exit can never make
# curl fail on a closed pipe under pipefail, and -F matches the name
# literally (the dot in the model name must not act as a regex wildcard).
model_present() {
  local tags
  tags="\$(curl -fsS "\$OLLAMA_URL/api/tags")" || return 1
  grep -qF -- "\$MODEL_NAME" <<<"\$tags"
}

if ! systemctl is-active --quiet ollama; then
  systemctl restart ollama
  sleep 5
fi

# Try asking Ollama if the tag exists
if ! model_present; then
  echo "Expected model \$MODEL_NAME missing. Initiating hydration..."
  # Ensure wiped ephemeral NVMe disks are scaffolded pre-hydration
  sudo mkdir -p "${NVME_ROOT}/logs" "${NVME_ROOT}/models" "${NVME_ROOT}/state"
  sudo chown -R ollama:ollama "${NVME_ROOT}"
  /usr/local/bin/desineuron-hydrate-qwen36.sh "\$MODEL_NAME"
  sleep 5
fi

# Verify final state
if model_present; then
  echo "healthy"
  exit 0
else
  echo "unhealthy: Model \$MODEL_NAME failed to register" >&2
  exit 1
fi
EOF
sudo chmod 0755 "$WATCHDOG_SCRIPT"
# 4. Write Watchdog Systemd Service & Timer
# The service is a oneshot that runs the watchdog script once; the timer
# triggers it 2 minutes after boot and then 5 minutes after each activation.
sudo tee "/etc/systemd/system/desineuron-ollama-watchdog.service" >/dev/null <<EOF
[Unit]
Description=Desineuron GPU Ollama Watchdog for Model $MODEL_NAME
# After= only orders units; Wants= is what pulls network-online.target in.
Wants=network-online.target
After=network-online.target ollama.service
[Service]
Type=oneshot
Environment="HOME=/root"
ExecStart=$WATCHDOG_SCRIPT
EOF

sudo tee "/etc/systemd/system/desineuron-ollama-watchdog.timer" >/dev/null <<EOF
[Unit]
Description=Watchdog run for Ollama Model $MODEL_NAME every 5 mins
[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Unit=desineuron-ollama-watchdog.service
[Install]
WantedBy=timers.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-ollama-watchdog.timer

# Run the watchdog once right away. Starting a oneshot unit blocks until the
# script finishes, so capture the status instead of letting `set -e` abort
# before any diagnostics are printed.
watchdog_rc=0
sudo systemctl start desineuron-ollama-watchdog.service || watchdog_rc=$?

if (( watchdog_rc != 0 )); then
  echo "Ollama Watchdog installed, but the first watchdog run failed (exit ${watchdog_rc})." >&2
  echo "Inspect it with: journalctl -u desineuron-ollama-watchdog.service" >&2
  # Status output is informational here; the failure is reported via the exit code.
  sudo systemctl --no-pager status desineuron-ollama-watchdog.timer || true
  exit "${watchdog_rc}"
fi

echo "Ollama Watchdog installed and model $MODEL_NAME setup initiated."
sudo systemctl --no-pager status desineuron-ollama-watchdog.timer