Files
Project_Velocity/user_data_bootstrap.sh
2026-04-12 02:02:58 +05:30

300 lines
11 KiB
Bash

#!/bin/bash
# ============================================================
# Desineuron AWS Node 4x L4 (96GB VRAM) Spot — Bootstrap
# Instance: g6.12xlarge | 48 vCPU | 192 GiB RAM | 4x L4 GPU
# NVMe: 4x 940 GB → RAID-0 /data (~3.76 TB)
# S3 Source: s3://project-velocity/
# ============================================================
# Strict mode: -e aborts on an unhandled non-zero exit status, -u treats
# expansion of an unset variable as an error, and pipefail makes a pipeline
# fail when any stage of it fails.
set -euo pipefail
# From here on, stdout and stderr of this script (and of every command it
# runs) are written to /var/log/user-data.log; the file is truncated first.
exec > /var/log/user-data.log 2>&1
#######################################
# Print a timestamped log line.
# Arguments: the message; multiple arguments are joined with single spaces.
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout.
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Opening banner.
banner_rule="========================================================"
log "$banner_rule"
log " Desineuron L4 Node Bootstrap Starting"
log "$banner_rule"
# ─────────────────────────────────────────────
# 1. System prep
# ─────────────────────────────────────────────
log "[1/9] System prep..."
# Keep apt/dpkg from stopping on interactive prompts.
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
# NOTE: `lsblk` is not a package of its own on Debian/Ubuntu — the binary is
# shipped by util-linux. Requesting it made apt fail with "Unable to locate
# package", which aborted the entire bootstrap under `set -e`.
apt-get install -y -qq mdadm unzip git curl wget nvme-cli util-linux fio
# ─────────────────────────────────────────────
# 2. NVMe RAID-0 — 4x 940 GB → /data
# ─────────────────────────────────────────────
log "[2/9] Setting up NVMe RAID-0..."
# Number of instance-store NVMe drives this instance type is expected to expose.
EXPECTED_NVME=4
NVME_DEVS=()
COUNT=0
# Poll for the drives: 60 attempts, 2 s apart (about two minutes in total).
for ((i = 1; i <= 60; i++)); do
  # Select drives by model string rather than by excluding "nvme0": NVMe
  # enumeration order is not guaranteed, so the root EBS volume is not
  # necessarily nvme0, and everything selected here is wiped further down.
  mapfile -t NVME_DEVS < <(
    lsblk -d -n -o NAME,TYPE,MODEL \
      | awk '$1 ~ /^nvme/ && $2 == "disk" && /Instance Storage/ {print "/dev/" $1}' \
      | sort
  )
  COUNT=${#NVME_DEVS[@]}
  if [ "$COUNT" -ge "$EXPECTED_NVME" ]; then
    log " → Found $COUNT NVMe devices: ${NVME_DEVS[*]}"
    break
  fi
  log " → Waiting for NVMe devices ($i/60)..."
  sleep 2
done
# Stop here instead of handing mdadm an incomplete (or empty) device list.
if [ "$COUNT" -lt "$EXPECTED_NVME" ]; then
  log " ✗ Found only $COUNT of $EXPECTED_NVME instance-store NVMe devices — aborting"
  exit 1
fi
# Wipe any existing RAID superblocks / filesystem signatures. Failures are
# ignored on purpose: a blank drive has nothing to erase.
for dev in "${NVME_DEVS[@]}"; do
  mdadm --zero-superblock --force "$dev" 2>/dev/null || true
  wipefs -af "$dev" 2>/dev/null || true
done
# Create one RAID-0 array across every detected drive (512 KiB chunk).
# --run starts the array without asking for confirmation.
mdadm --create /dev/md0 \
  --level=0 \
  --raid-devices="$COUNT" \
  --chunk=512 \
  --force --run \
  "${NVME_DEVS[@]}"
# Format with ext4. Lazy inode-table/journal initialisation is disabled so
# that work is done now rather than in the background after mounting.
mkfs.ext4 -F \
  -E lazy_itable_init=0,lazy_journal_init=0 \
  -O large_file \
  /dev/md0
# Mount
mkdir -p /data
mount /dev/md0 /data
# Record the mount in fstab by filesystem UUID when it can be read (stable
# even if the array is assembled under a different md name), falling back to
# the device path. Skip the append if /data already has an entry.
FSTAB_SRC=/dev/md0
MD_UUID=$(blkid -s UUID -o value /dev/md0 || true)
if [ -n "$MD_UUID" ]; then
  FSTAB_SRC="UUID=$MD_UUID"
fi
if ! grep -qs '[[:space:]]/data[[:space:]]' /etc/fstab; then
  echo "$FSTAB_SRC /data ext4 defaults,nofail,noatime 0 0" >> /etc/fstab
fi
DISK_SIZE=$(df -h /data | awk 'NR==2{print $2}')
log " → /data mounted — size: $DISK_SIZE"
# Save the array definition and rebuild the initramfs so the array is
# assembled again at boot.
mdadm --detail --scan >> /etc/mdadm/mdadm.conf
update-initramfs -u
# ─────────────────────────────────────────────
# 3. Install s5cmd v2.2.2
# ─────────────────────────────────────────────
log "[3/9] Installing s5cmd v2.2.2..."
# -f turns an HTTP error into a curl failure (instead of piping an error page
# into tar), -S keeps the error message visible despite -s, and --retry
# rides out transient network failures early in boot.
# Only the s5cmd binary is extracted from the release tarball.
curl -fsSL --retry 3 \
  https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz \
  | tar xz -C /usr/local/bin s5cmd
chmod +x /usr/local/bin/s5cmd
# Record the installed version in the bootstrap log.
log "$(s5cmd version)"
# ─────────────────────────────────────────────
# 4. S3 Sync — full s3://project-velocity/ → /data/
# ─────────────────────────────────────────────
# Source bucket and local destination root for the two synced prefixes.
s3_bucket="s3://project-velocity"
local_root="/data"

log "[4/9] Syncing ${s3_bucket}/ → ${local_root}/ ..."
mkdir -p "${local_root}/models" "${local_root}/comfy-engine"

# models/ — large files: 32 workers, part size 50.
models_sync_opts=(--concurrency 32 --part-size 50)
log " → Syncing models/ prefix..."
s5cmd sync "${models_sync_opts[@]}" "${s3_bucket}/models/*" "${local_root}/models/"

# comfy-engine/ — holds comfy_engine.zip; 8 workers.
engine_sync_opts=(--concurrency 8)
log " → Syncing comfy-engine/ prefix..."
s5cmd sync "${engine_sync_opts[@]}" "${s3_bucket}/comfy-engine/*" "${local_root}/comfy-engine/"

log " → S3 sync complete."
# List what was downloaded so the bootstrap log shows it.
for prefix in models comfy-engine; do
  log " → ${prefix}/ contents:"
  ls -lh "${local_root}/${prefix}/"
done
# ─────────────────────────────────────────────
# 5. Extract ComfyUI from comfy_engine.zip
# ─────────────────────────────────────────────
log "[5/9] Extracting ComfyUI from comfy_engine.zip..."
# The archive is downloaded to /data/comfy-engine (hyphen) and unpacked into
# /data/comfy_engine (underscore).
mkdir -p /data/comfy_engine
unzip -q /data/comfy-engine/comfy_engine.zip -d /data/comfy_engine/
# Detect the ComfyUI root: the directory containing main.py. The zip may wrap
# everything in a top-level folder, so search up to three levels deep.
COMFY_ROOT=""
if [ -f /data/comfy_engine/main.py ]; then
  COMFY_ROOT="/data/comfy_engine"
else
  # -print -quit stops at the first match; -maxdepth precedes the tests so
  # GNU find does not emit its option-ordering warning.
  MAIN_PY=$(find /data/comfy_engine -maxdepth 3 -type f -name "main.py" -print -quit)
  if [ -n "$MAIN_PY" ]; then
    COMFY_ROOT=$(dirname "$MAIN_PY")
  fi
fi
# Fail with a clear message instead of a "dirname: missing operand" error
# when the archive does not contain main.py.
if [ -z "$COMFY_ROOT" ]; then
  log " ✗ main.py not found within 3 levels of /data/comfy_engine — cannot locate ComfyUI root"
  exit 1
fi
log " → ComfyUI root: $COMFY_ROOT"
# Record the location in /etc/environment.
echo "COMFY_ROOT=$COMFY_ROOT" >> /etc/environment
# ─────────────────────────────────────────────
# 6. Create model directories & symlinks
# ─────────────────────────────────────────────
log "[6/9] Creating model directory structure and symlinks..."
# test_inputs is created here as well: the systemd unit written later passes
# it to ComfyUI as --input-directory, and nothing else creates it.
mkdir -p \
  "$COMFY_ROOT/models/checkpoints" \
  "$COMFY_ROOT/models/controlnet" \
  "$COMFY_ROOT/models/sams" \
  "$COMFY_ROOT/models/ipadapter" \
  "$COMFY_ROOT/models/vae" \
  "$COMFY_ROOT/models/upscale_models" \
  "$COMFY_ROOT/test_inputs" \
  "$COMFY_ROOT/test_outputs" \
  "$COMFY_ROOT/cache/masks"

#######################################
# Symlink one model file from /data/models into a ComfyUI model directory.
# A missing file or a failed link is logged as a warning, never fatal.
# Globals:   COMFY_ROOT (read)
# Arguments: $1 - file name under /data/models
#            $2 - sub-directory of $COMFY_ROOT/models to link into
#            $3 - human-readable model kind, used in warnings
#######################################
link_model() {
  local file=$1 subdir=$2 label=$3
  if [ -f "/data/models/$file" ]; then
    # Explicit if/else: with `test && ln || log` a failed ln would have been
    # reported as a missing model.
    ln -sf "/data/models/$file" "$COMFY_ROOT/models/$subdir/" \
      || log " ⚠ Failed to link $label model: $file"
  else
    log " ⚠ $label model not found: $file (can download later)"
  fi
}

# Checkpoints
link_model "realvisxlV50_v50LightningBakedvae.safetensors" checkpoints "Checkpoint"
# ControlNet
for f in control_v11f1p_sd15_depth.pth control_v11p_sd15_canny.pth \
  control_v11p_sd15_seg.pth control_v11p_sd15_mlsd.pth; do
  link_model "$f" controlnet "ControlNet"
done
# SAM
link_model "sam_vit_h_4b8939.pth" sams "SAM"
# IPAdapter
link_model "ip-adapter-faceid-plusv2_sdxl.bin" ipadapter "IPAdapter"
log " → Model symlinks created."
# ─────────────────────────────────────────────
# 7. Python / pip environment
# ─────────────────────────────────────────────
log "[7/9] Installing Python dependencies..."
# The AMI is expected to ship Python 3.10+ and PyTorch 2.7 + CUDA 12.4 under
# /opt/conda. Use the conda base environment when it exists, otherwise fall
# back to the system python3.
if [ -f /opt/conda/etc/profile.d/conda.sh ]; then
  # conda's shell hooks may expand variables that are unset in a
  # non-interactive shell, so nounset is relaxed around activation.
  set +u
  source /opt/conda/etc/profile.d/conda.sh
  conda activate base
  set -u
  PYTHON=python
else
  PYTHON=python3
fi
# Upgrade pip
"$PYTHON" -m pip install --upgrade pip --quiet
# Install ComfyUI requirements (if present in extracted zip)
if [ -f "$COMFY_ROOT/requirements.txt" ]; then
  "$PYTHON" -m pip install -r "$COMFY_ROOT/requirements.txt" --quiet
  log " → ComfyUI requirements installed"
fi
# Install Dream Weaver automation requirements
if [ -f "$COMFY_ROOT/comfy_engine/requirements.txt" ]; then
  "$PYTHON" -m pip install -r "$COMFY_ROOT/comfy_engine/requirements.txt" --quiet
fi
# Core additional packages.
# Every version specifier is quoted: unquoted, the shell parses `>=1.24.0` as
# an output redirection to a file named "=1.24.0", so pip received the bare
# package name and the minimum-version constraint was silently dropped.
# NOTE(review): the xformers wheel may pull in a different torch build than
# the one preinstalled on the AMI — confirm the resulting torch/CUDA pairing.
"$PYTHON" -m pip install --quiet \
  xformers \
  'numpy>=1.24.0' \
  'Pillow>=10.0.0' \
  'opencv-python>=4.8.0' \
  'requests>=2.31.0' \
  'websockets>=11.0.0' \
  'aiohttp>=3.8.0' \
  'aiofiles>=23.0.0' \
  'watchdog>=3.0.0' \
  nvidia-ml-py3
log " → All Python dependencies installed."
# ─────────────────────────────────────────────
# 8. Clone custom nodes
# ─────────────────────────────────────────────
log "[8/9] Installing ComfyUI custom nodes..."
CUSTOM_NODES="$COMFY_ROOT/custom_nodes"
mkdir -p "$CUSTOM_NODES"
# The relative paths below are resolved against this directory.
cd "$CUSTOM_NODES"
REPOS=(
  "https://github.com/Fannovel16/comfyui_controlnet_aux.git"
  "https://github.com/ltdrdata/ComfyUI-Impact-Pack.git"
  "https://github.com/ltdrdata/ComfyUI-Manager.git"
  "https://github.com/WASasquatch/was-node-suite-comfyui.git"
  "https://github.com/Kosinkadink/ComfyUI-Advanced-ControlNet.git"
  "https://github.com/storyicon/comfyui_segment_anything.git"
  "https://github.com/cubiq/ComfyUI_IPAdapter_plus.git"
)
# Each step is best-effort: one failing repository must not abort the
# bootstrap, but every failure is written to the log instead of being
# discarded, so a node that later fails to load can be traced back.
for repo in "${REPOS[@]}"; do
  # Directory name = last URL component without the .git suffix.
  NAME=$(basename "$repo" .git)
  if [ -d "$NAME" ]; then
    log "$NAME already exists, pulling..."
    git -C "$NAME" pull --quiet || log " ⚠ Failed to update $NAME (keeping existing checkout)"
  else
    log " → Cloning $NAME..."
    git clone --quiet --depth 1 "$repo" "$NAME" || log " ⚠ Failed to clone $NAME"
  fi
  # Install node requirements
  if [ -f "$NAME/requirements.txt" ]; then
    "$PYTHON" -m pip install -r "$NAME/requirements.txt" --quiet \
      || log " ⚠ Failed to install requirements for $NAME"
  fi
done
log " → Custom nodes installed."
# ─────────────────────────────────────────────
# 9. Create ComfyUI systemd service
# ─────────────────────────────────────────────
log "[9/9] Creating comfyui systemd service..."
# Resolve the interpreter to an absolute path for ExecStart
# (`command -v` is the shell builtin replacement for `which`).
PYTHON_BIN=$(command -v "$PYTHON")
# The heredoc delimiter is unquoted, so ${COMFY_ROOT} and ${PYTHON_BIN} are
# expanded now and each "\\" is written to the unit file as a single "\"
# line continuation.
# After= is network-online.target so that it matches Wants=; with
# After=network.target the unit was not ordered after network-online.
# NOTE(review): --bf16 and --xformers do not appear among upstream ComfyUI's
# command-line flags as far as I can tell (upstream has --bf16-unet /
# --bf16-vae and --disable-xformers) — confirm the bundled main.py accepts
# them, otherwise the service will exit at argument parsing.
cat > /etc/systemd/system/comfyui.service << EOF
[Unit]
Description=ComfyUI — Dream Weaver (Desineuron L4 Node)
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=ubuntu
WorkingDirectory=${COMFY_ROOT}
ExecStart=${PYTHON_BIN} ${COMFY_ROOT}/main.py \\
--port 8000 \\
--listen 0.0.0.0 \\
--bf16 \\
--highvram \\
--xformers \\
--output-directory ${COMFY_ROOT}/test_outputs \\
--input-directory ${COMFY_ROOT}/test_inputs
Restart=on-failure
RestartSec=10
StandardOutput=append:/var/log/comfyui.log
StandardError=append:/var/log/comfyui.log
Environment=PYTHONUNBUFFERED=1
Environment=CUDA_VISIBLE_DEVICES=0,1,2,3
[Install]
WantedBy=multi-user.target
EOF
# Ensure ubuntu owns everything
chown -R ubuntu:ubuntu /data
# Create the log file first and then restrict it. The previous
# `chmod … || touch …` left a freshly created file at the default mode,
# because chmod was only attempted before the file existed.
touch /var/log/comfyui.log
chmod 600 /var/log/comfyui.log
chown ubuntu:ubuntu /var/log/comfyui.log
# Enable and start
systemctl daemon-reload
systemctl enable comfyui.service
systemctl start comfyui.service
# Closing summary. The command substitutions stay inside the log arguments so
# that a failing df / nvidia-smi cannot abort the script at this point.
log "========================================================"
log " Bootstrap COMPLETE"
log " ComfyUI listening on 0.0.0.0:8000"
log " /data size: $(df -h /data | awk 'NR==2{print $2}')"
log " GPU count: $(nvidia-smi --list-gpus | wc -l)"
log "========================================================"