#!/usr/bin/env bash
# =============================================================================
# nemoclaw_deploy.sh
# Deploys NemoClaw on the AWS G6.12xlarge instance.
# - All data/install paths on NVMe (/opt/dlami/nvme/)
# - Configures OpenShell to use existing Ollama (qwen3.5:27b, port 11434)
# - GPUs 0+1 are Ollama's. Do NOT reassign them.
# - ComfyUI owns GPUs 2+3. Do NOT touch.
# - Creates a systemd service for the NemoClaw gateway.
# =============================================================================

# Strict mode: stop on any failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Deployment constants shared by every step below.
# NVME is intentionally not readonly: the NVMe safety check may repoint it
# to a fallback directory.
OPENCLAW_PORT=8080   # Port our FastAPI backend targets
OLLAMA_MODEL="qwen3.5:27b"
OLLAMA_URL="http://127.0.0.1:11434"
AGENT_NAME="velocity-sentinel"
NVME="/opt/dlami/nvme"

# Startup banner (shows the initial NVMe target).
cat <<BANNER
================================================================
  Project Velocity — NemoClaw + OpenShell Deploy Script
  Instance: G6.12xlarge  |  NVMe: $NVME
================================================================
BANNER

# ──────────────────────────────────────────────────────────────────
# 0. Safety checks
# ──────────────────────────────────────────────────────────────────
# The steps below install packages and write under /etc, so require root.
if [ "$(id -u)" -ne 0 ]; then
  echo "[ERROR] Run as root or with sudo" >&2
  exit 1
fi

# Choose the install target:
#   - $NVME is a mount point          → use it.
#   - $NVME is a plain directory      → still use it (unchanged behaviour), but
#                                       say so: data lands on whatever
#                                       filesystem holds that directory.
#   - $NVME does not exist            → fall back to /home/ubuntu/nvme.
if mountpoint -q "$NVME" 2>/dev/null; then
  :  # mounted where expected
elif [ -d "$NVME" ]; then
  echo "[WARN] $NVME exists but is not a mount point — data will be written to its parent filesystem"
else
  echo "[WARN] NVMe not mounted at $NVME — using /home/ubuntu/nvme as fallback"
  NVME="/home/ubuntu/nvme"
  mkdir -p "$NVME"
fi

echo "[✓] NVMe target: $NVME"

# Confirm Ollama is alive and lists the configured model before proceeding.
# The response is captured first and searched afterwards: piping curl straight
# into `grep -q` under pipefail can report failure when grep exits early.
# The model name is matched as a fixed string so the success message is only
# printed for the model it names. A miss is a warning, not an error.
OLLAMA_TAGS=$(curl -sf --max-time 10 "$OLLAMA_URL/api/tags" 2>/dev/null || true)
if grep -qF -- "$OLLAMA_MODEL" <<<"$OLLAMA_TAGS"; then
  echo "[✓] Ollama confirmed running with $OLLAMA_MODEL"
else
  echo "[WARN] Ollama at $OLLAMA_URL doesn't show $OLLAMA_MODEL yet — proceeding anyway"
fi

# ──────────────────────────────────────────────────────────────────
# 1. Node.js 22 (NemoClaw requirement: >=22.16)
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[1/7] Installing Node.js 22..."

# NemoClaw needs Node.js >= 22.16, so the major number alone is not enough.
# Parse major and minor from `node --version` (e.g. "v22.16.0"). Both stay 0
# when node is missing or prints something unexpected, which forces an install.
NODE_MAJOR=0
NODE_MINOR=0
if [[ "$(node --version 2>/dev/null || true)" =~ ^v?([0-9]+)\.([0-9]+) ]]; then
  NODE_MAJOR=$((10#${BASH_REMATCH[1]}))
  NODE_MINOR=$((10#${BASH_REMATCH[2]}))
fi
# shellcheck disable=SC2034 — major version, kept under its original name
NODE_VERSION=$NODE_MAJOR

if (( NODE_MAJOR > 22 || (NODE_MAJOR == 22 && NODE_MINOR >= 16) )); then
  echo "[✓] Node.js $(node --version) already installed"
else
  # Run NodeSource's setup_22.x script, then install the nodejs package.
  curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
  apt-get install -y nodejs
  echo "[✓] Node.js $(node --version) installed"
fi

# Assigning first keeps the original "abort if npm is missing" behaviour
# (a failing command substitution inside an echo argument would be ignored).
NPM_VERSION=$(npm --version)
echo "[✓] npm $NPM_VERSION"

# ──────────────────────────────────────────────────────────────────
# 2. Docker (required for OpenShell container runtime)
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[2/7] Ensuring Docker is installed..."

# Docker counts as present only when the CLI exists AND the daemon answers
# `docker info`; otherwise install it from Docker's apt repository.
if command -v docker &>/dev/null && docker info &>/dev/null; then
  # `docker --version` prints "Docker version X.Y.Z, build …"; drop the comma.
  echo "[✓] Docker $(docker --version | awk '{gsub(/,/, "", $3); print $3}') already running"
else
  echo "  Installing Docker..."
  apt-get install -y ca-certificates curl gnupg lsb-release
  install -m 0755 -d /etc/apt/keyrings
  # --batch --yes: overwrite a keyring left by an earlier (partial) run instead
  # of stopping at gpg's overwrite confirmation.
  curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
    | gpg --batch --yes --dearmor -o /etc/apt/keyrings/docker.gpg
  chmod a+r /etc/apt/keyrings/docker.gpg
  # Resolve arch/codename as separate assignments so a failure aborts here
  # rather than silently producing a malformed repository line.
  DOCKER_ARCH=$(dpkg --print-architecture)
  DOCKER_CODENAME=$(lsb_release -cs)
  printf 'deb [arch=%s signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu %s stable\n' \
    "$DOCKER_ARCH" "$DOCKER_CODENAME" \
    > /etc/apt/sources.list.d/docker.list
  apt-get update -q
  apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
  systemctl enable docker
  systemctl start docker
  echo "[✓] Docker installed"
fi

# Point Docker's data-root at NVMe so images don't fill the root disk.
# Only the daemon.json setting is changed; nothing here copies data that
# Docker already stored elsewhere.
# The step is skipped when daemon.json already mentions "nvme".
DOCKER_DAEMON_JSON="/etc/docker/daemon.json"
if ! grep -q "nvme" "$DOCKER_DAEMON_JSON" 2>/dev/null; then
  echo "  Moving Docker data-root → $NVME/docker"
  mkdir -p "$NVME/docker" "$(dirname "$DOCKER_DAEMON_JSON")"
  # Merge the key into any existing config. The file path and target are passed
  # as arguments (never spliced into Python source), a missing or empty file is
  # treated as {}, and the result is written to a temp file and renamed so an
  # invalid existing config aborts the script without truncating daemon.json.
  python3 - "$DOCKER_DAEMON_JSON" "$NVME/docker" <<'PY'
import json
import os
import sys
import tempfile

path, data_root = sys.argv[1], sys.argv[2]

try:
    with open(path, encoding="utf-8") as fh:
        raw = fh.read()
except FileNotFoundError:
    raw = ""

cfg = json.loads(raw) if raw.strip() else {}
if not isinstance(cfg, dict):
    sys.exit(f"{path}: expected a JSON object at the top level")
cfg["data-root"] = data_root

fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path), prefix=".daemon.json.")
try:
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        json.dump(cfg, fh, indent=2)
        fh.write("\n")
    os.chmod(tmp, 0o644)
    os.replace(tmp, path)
except BaseException:
    if os.path.exists(tmp):
        os.unlink(tmp)
    raise
PY
  systemctl restart docker
  echo "[✓] Docker data-root → $NVME/docker"
fi

# ──────────────────────────────────────────────────────────────────
# 3. Install NemoClaw (headless via env vars)
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[3/7] Installing NemoClaw..."

# Keep NemoClaw / OpenShell state on NVMe. The three variables are exported for
# child processes; HOME_OVERRIDE is not read anywhere in this script.
export NEMOCLAW_HOME="$NVME/nemoclaw"
export OPENSHELL_HOME="$NVME/openshell"
export HOME_OVERRIDE="$NVME/home"
mkdir -p "$NEMOCLAW_HOME" "$OPENSHELL_HOME" "$HOME_OVERRIDE"

# Link ~/.nemoclaw and ~/.openshell (for root and ubuntu) to NVMe.
# Best effort: a failing link is ignored.
ln -sfn "$NEMOCLAW_HOME" /root/.nemoclaw 2>/dev/null || true
ln -sfn "$NEMOCLAW_HOME" /home/ubuntu/.nemoclaw 2>/dev/null || true
ln -sfn "$OPENSHELL_HOME" /root/.openshell 2>/dev/null || true
ln -sfn "$OPENSHELL_HOME" /home/ubuntu/.openshell 2>/dev/null || true

if command -v nemoclaw &>/dev/null; then
  echo "[✓] nemoclaw already installed: $(nemoclaw --version 2>/dev/null || echo 'version unknown')"
else
  echo "  Downloading NemoClaw installer..."
  INSTALLER_SCRIPT="$NVME/nemoclaw_install.sh"
  curl -fsSL https://www.nvidia.com/nemoclaw.sh -o "$INSTALLER_SCRIPT"
  chmod +x "$INSTALLER_SCRIPT"

  # Run the installer non-interactively
  # NEMOCLAW_SKIP_ONBOARD=1 bypasses the interactive wizard (undocumented but standard pattern)
  # We'll do manual onboarding after install using CLI flags
  # An installer error is reported but not fatal: the binary is looked for below.
  if ! NEMOCLAW_SKIP_ONBOARD=1 \
    NEMOCLAW_HOME="$NEMOCLAW_HOME" \
    bash "$INSTALLER_SCRIPT"; then
    echo "[WARN] NemoClaw installer exited with an error — checking for the binary anyway"
  fi

  # Reload PATH. ~/.bashrc is not written for strict mode, so relax `set -u`
  # while sourcing it: an unset variable there would otherwise kill the script.
  export PATH="$PATH:/usr/local/bin:/root/.local/bin"
  set +u
  # shellcheck disable=SC1090
  source ~/.bashrc 2>/dev/null || true
  set -u

  if ! command -v nemoclaw &>/dev/null; then
    echo "[WARN] nemoclaw not in PATH yet — checking common locations..."
    for p in /usr/local/bin/nemoclaw /root/.local/bin/nemoclaw "$NVME/bin/nemoclaw"; do
      [ -f "$p" ] || continue
      # A file in a PATH directory that lookup still misses lacks the execute bit.
      [ -x "$p" ] || chmod +x "$p"
      # Never link /usr/local/bin/nemoclaw onto itself (ln would fail).
      if [ "$p" != "/usr/local/bin/nemoclaw" ]; then
        ln -sfn "$p" /usr/local/bin/nemoclaw
        echo "[✓] Linked nemoclaw from $p"
      fi
      break
    done
  fi

  # Report what actually happened instead of assuming success. The script
  # continues either way, matching its best-effort design.
  if command -v nemoclaw &>/dev/null; then
    echo "[✓] nemoclaw installed"
  else
    echo "[WARN] nemoclaw still not found on PATH — later steps that call it will fail"
  fi
fi

# ──────────────────────────────────────────────────────────────────
# 4. Onboard the Velocity Sentinel agent sandbox
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[4/7] Onboarding '$AGENT_NAME' NemoClaw sandbox..."

# The sandbox is treated as existing when `nemoclaw <name> status` exits 0.
if nemoclaw "$AGENT_NAME" status &>/dev/null; then
  echo "[✓] Sandbox '$AGENT_NAME' already exists — skipping creation"
else
  echo "  Running nemoclaw onboard (this may take a few minutes)..."
  # --provider compatible-endpoint: use our local Ollama instead of NVIDIA cloud
  # --yes: skip confirmation prompts
  if ! nemoclaw onboard \
    --name "$AGENT_NAME" \
    --provider compatible-endpoint \
    --endpoint "$OLLAMA_URL/v1" \
    --model "$OLLAMA_MODEL" \
    --yes \
    --no-messaging-bridge \
    --no-skills; then
    echo "[WARN] Structured onboard failed — trying minimal onboard..."
    # Fallback: let it run with defaults if flags are not supported in this alpha version.
    # `yes ""` answers every prompt with Enter; only the first 60 lines are shown.
    # The pipeline status is ignored (yes/head make it unreliable under pipefail).
    yes "" | nemoclaw onboard --name "$AGENT_NAME" 2>&1 | head -60 || true
  fi

  # Re-run the same status probe so the final message reflects the real outcome.
  if nemoclaw "$AGENT_NAME" status &>/dev/null; then
    echo "[✓] Sandbox onboarded"
  else
    echo "[WARN] Sandbox '$AGENT_NAME' is still not reporting status after onboarding — check the output above"
  fi
fi

# ──────────────────────────────────────────────────────────────────
# 5. Configure OpenShell to use Ollama (compatible endpoint)
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[5/7] Configuring OpenShell inference → Ollama (qwen3.5:27b)..."

# Set inference route to our local Ollama. If the full flag set is rejected,
# retry with provider + model only. INFERENCE_OK records whether either
# attempt succeeded so the closing message is truthful.
INFERENCE_OK=0
if openshell inference set \
  --provider compatible-endpoint \
  --base-url "$OLLAMA_URL/v1" \
  --api-key "ollama" \
  --model "$OLLAMA_MODEL" \
  --context-window 32768 \
  --max-tokens 4096; then
  INFERENCE_OK=1
else
  echo "[WARN] openshell inference set failed — trying alternate syntax..."
  if openshell inference set \
    --provider compatible-endpoint \
    --model "$OLLAMA_MODEL"; then
    INFERENCE_OK=1
  fi
fi

# Send one empty-prompt generate request carrying num_ctx=32768. Best effort:
# output and errors are discarded.
# NOTE(review): request options presumably apply to this request/model load
# only, not as a persistent model setting — confirm against the Ollama API docs.
echo "  Setting Ollama num_ctx=32768..."
curl -s -X POST "$OLLAMA_URL/api/generate" \
  -H "Content-Type: application/json" \
  -d "{\"model\":\"$OLLAMA_MODEL\",\"prompt\":\"\",\"options\":{\"num_ctx\":32768},\"stream\":false}" \
  > /dev/null 2>&1 || true

if [ "$INFERENCE_OK" -eq 1 ]; then
  echo "[✓] OpenShell inference configured → $OLLAMA_URL ($OLLAMA_MODEL)"
else
  echo "[WARN] OpenShell inference could not be configured → $OLLAMA_URL ($OLLAMA_MODEL)"
fi

# ──────────────────────────────────────────────────────────────────
# 6. Write OpenShell network policy (allow Velocity backend egress)
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[6/7] Writing OpenShell network policy..."

POLICY_DIR="$OPENSHELL_HOME/policy"
mkdir -p "$POLICY_DIR"

# The heredoc is unquoted so the sandbox name follows $AGENT_NAME instead of
# being hard-coded; the YAML contains no other shell-special characters.
cat > "$POLICY_DIR/velocity_egress.yaml" << POLICY
# OpenShell Network Egress Policy — Project Velocity Sentinel
# Applied to the $AGENT_NAME sandbox.
# All non-listed hosts are blocked by default.

version: "1"
sandbox: $AGENT_NAME

egress:
  # Local Ollama inference (Qwen 3.5 27B)
  - host: "127.0.0.1"
    ports: [11434]
    description: "Ollama LLM inference"
    action: allow

  # OpenShell gateway itself (loopback)
  - host: "127.0.0.1"
    ports: [8080, 8081, 8082, 8083, 8084, 8085]
    description: "OpenShell gateway ports"
    action: allow

  # Velocity FastAPI backend (same host)
  - host: "127.0.0.1"
    ports: [8000, 8001, 8288]
    description: "Velocity FastAPI backend"
    action: allow

  # PostgreSQL (same host)
  - host: "127.0.0.1"
    ports: [5432]
    description: "PostgreSQL DB"
    action: allow

  # Block everything else
  - host: "*"
    action: deny
    description: "Default deny — data sovereignty (India/Abu Dhabi)"
POLICY

# Try to apply the policy. Any failure (stderr is hidden) is reported as
# "not supported" and is not fatal; the YAML file stays on disk either way.
openshell policy apply "$POLICY_DIR/velocity_egress.yaml" 2>/dev/null || \
  echo "[WARN] Policy apply not supported yet in this alpha — YAML written for future use"

echo "[✓] Network policy written → $POLICY_DIR/velocity_egress.yaml"

# ──────────────────────────────────────────────────────────────────
# 7. Write NemoClaw systemd service
# ──────────────────────────────────────────────────────────────────
echo ""
echo "[7/7] Installing systemd service: nemoclaw-velocity.service..."

# Resolve binaries, falling back to /usr/local/bin when not on PATH.
# OPENSHELL_BIN is resolved but not referenced by the unit below.
NEMOCLAW_BIN=$(command -v nemoclaw || echo "/usr/local/bin/nemoclaw")
OPENSHELL_BIN=$(command -v openshell || echo "/usr/local/bin/openshell")

# The unit appends its output to files under $NVME/logs, so create that first.
mkdir -p "$NVME/logs"

# Unquoted heredoc: shell variables are expanded now, at install time.
# \$MAINPID is escaped so systemd (not this script) substitutes it.
# The Ollama URL and model come from the script's variables so the unit cannot
# drift from the rest of the deployment.
# NOTE(review): the unit runs as ubuntu while this script creates
# $NVME/nemoclaw and $NVME/openshell as root — confirm the service can write there.
cat > /etc/systemd/system/nemoclaw-velocity.service << SERVICE
[Unit]
Description=NemoClaw Velocity Sentinel Gateway
Documentation=https://github.com/NVIDIA/NemoClaw
After=network.target ollama.service docker.service
Wants=ollama.service docker.service

[Service]
Type=simple
User=ubuntu
Group=ubuntu
WorkingDirectory=$NVME/nemoclaw

# GPU constraint: NemoClaw itself is CPU-bound (inference goes to Ollama)
# Ollama already owns GPUs 0,1. ComfyUI owns GPUs 2,3.
Environment=CUDA_VISIBLE_DEVICES=""
Environment=NEMOCLAW_HOME=$NVME/nemoclaw
Environment=OPENSHELL_HOME=$NVME/openshell
Environment=OLLAMA_BASE_URL=$OLLAMA_URL
Environment=VELOCITY_NEMO_MODEL=$OLLAMA_MODEL
Environment=GATEWAY_PORT=$OPENCLAW_PORT

ExecStart=$NEMOCLAW_BIN $AGENT_NAME connect --gateway-port $OPENCLAW_PORT
ExecReload=/bin/kill -HUP \$MAINPID
Restart=always
RestartSec=10
StandardOutput=append:$NVME/logs/nemoclaw-velocity.log
StandardError=append:$NVME/logs/nemoclaw-velocity.log

# Limits
LimitNOFILE=65536
TimeoutStopSec=30

[Install]
WantedBy=multi-user.target
SERVICE

systemctl daemon-reload
systemctl enable nemoclaw-velocity.service

# A failed start is tolerated (may fail on first boot if onboard not done),
# but it is reported as such rather than as a success.
if systemctl start nemoclaw-velocity.service; then
  echo "[✓] nemoclaw-velocity.service enabled and started"
else
  echo "[WARN] nemoclaw-velocity.service enabled but failed to start — onboarding may not be complete"
fi

# ──────────────────────────────────────────────────────────────────
# Finalize: Detect gateway port & write env file
# ──────────────────────────────────────────────────────────────────
echo ""
echo "================================================================"
echo "  Writing Velocity backend environment file..."
echo "================================================================"

VELOCITY_ENV="$NVME/velocity/env"
mkdir -p "$(dirname "$VELOCITY_ENV")"

# Gateway URL is built from the configured port; nothing is discovered here.
GATEWAY_URL="http://127.0.0.1:$OPENCLAW_PORT"
GATEWAY_CHAT_URL="$GATEWAY_URL/v1/chat/completions"

# Quick connectivity test (will succeed once nemoclaw starts)
echo "  Testing gateway at $GATEWAY_CHAT_URL ..."
sleep 5
# curl prints %{http_code} even when it fails ("000" if no response arrived),
# so appending a fallback with `|| echo "000"` produced values like "000000"
# or "404000". Instead: no -f (HTTP errors keep their real code), and the
# fallback REPLACES the value when curl itself fails. --max-time bounds the
# wait so a hung gateway cannot stall the script.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 \
  -X POST "$GATEWAY_CHAT_URL" \
  -H "Content-Type: application/json" \
  -d "{\"model\":\"$OLLAMA_MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":5}" \
  2>/dev/null) || HTTP_CODE="000"
[ -n "$HTTP_CODE" ] || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "201" ]; then
  echo "[✓] Gateway responding at $GATEWAY_CHAT_URL (HTTP $HTTP_CODE)"
else
  echo "[WARN] Gateway not yet responding (HTTP $HTTP_CODE) — it may still be starting up"
fi

# Write the backend environment file. The heredoc is unquoted, so every
# variable is expanded now; the model and Ollama URL come from the script's
# variables rather than repeated literals, keeping this file in step with the
# values used elsewhere in the deployment. The JWT and PostgreSQL entries are
# left as commented placeholders to be filled in by hand.
cat > "$VELOCITY_ENV" << ENV
# Project Velocity — Backend Environment
# Generated by nemoclaw_deploy.sh
# Loaded by: source $VELOCITY_ENV

# ── NemoClaw / OpenShell Gateway ──────────────────────────────────
NEMOCLAW_BASE_URL=$GATEWAY_URL
NEMOCLAW_CHAT_URL=$GATEWAY_CHAT_URL
NEMOCLAW_MODEL=$OLLAMA_MODEL
NEMOCLAW_TIMEOUT_S=30.0
NEMOCLAW_TEMPERATURE=0.2

# ── Ollama (direct fallback if OpenShell gateway not up) ──────────
OLLAMA_BASE_URL=$OLLAMA_URL

# ── NemoClaw Prompts ──────────────────────────────────────────────
NEMOCLAW_PROMPT_DIR=$NVME/nemoclaw/prompts

# ── JWT / Auth ────────────────────────────────────────────────────
# VELOCITY_JWT_SECRET=<SET_THIS>

# ── PostgreSQL ────────────────────────────────────────────────────
# VELOCITY_DB_DSN=postgresql://velocity_app:<PW>@127.0.0.1:5432/velocity
ENV

echo "[✓] Environment file written → $VELOCITY_ENV"
# Closing summary: what was deployed, where it lives, and how to verify it.
# Emitted as one heredoc; the leading empty line is the blank separator.
cat <<SUMMARY

================================================================
  DONE. Summary:

  Agent name   : $AGENT_NAME
  Gateway URL  : $GATEWAY_URL
  Chat endpoint: $GATEWAY_CHAT_URL
  Model        : $OLLAMA_MODEL (via Ollama on port 11434)
  GPUs 0,1     : Ollama (unchanged)
  GPUs 2,3     : ComfyUI (unchanged)
  Env file     : $VELOCITY_ENV
  Service log  : $NVME/logs/nemoclaw-velocity.log

  Next commands to verify:
    nemoclaw $AGENT_NAME status
    nemoclaw $AGENT_NAME logs --follow
    curl $GATEWAY_CHAT_URL (POST with messages[])
================================================================
SUMMARY