#!/bin/bash
# ============================================================
# Desineuron AWS Node 4x L4 (96GB VRAM) Spot — Bootstrap
# Instance: g6.12xlarge | 48 vCPU | 192 GiB RAM | 4x L4 GPU
# NVMe: 4x 940 GB → RAID-0 /data (~3.76 TB)
# S3 Source: s3://project-velocity/
#
# Origin: forked from sagnik/Project_Velocity
# ============================================================

# Strict mode: abort on the first unhandled error, on use of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# From here on, everything the script prints (stdout and stderr) is written
# to the bootstrap log instead of the console.
exec > /var/log/user-data.log 2>&1

# Record where the script died so an abort caused by `set -e` is visible in
# the log rather than just ending silently.
trap 'echo "[bootstrap] aborted at line ${LINENO}: ${BASH_COMMAND}"' ERR

#######################################
# Print a timestamped log line.
# Arguments: $* - message words; joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

log "========================================================"
log " Desineuron L4 Node Bootstrap Starting"
log "========================================================"

# ─────────────────────────────────────────────
# 1. System prep
# ─────────────────────────────────────────────
log "[1/9] System prep..."

# Keep apt from stopping on interactive prompts.
export DEBIAN_FRONTEND=noninteractive

apt-get update -qq

# `lsblk` is deliberately not listed: it is a binary shipped by util-linux,
# not a package name, and asking apt for it fails the whole install.
apt-get install -y -qq \
  mdadm \
  unzip \
  git \
  curl \
  wget \
  nvme-cli \
  util-linux \
  fio

# ─────────────────────────────────────────────
# 2. NVMe RAID-0 — 4x 940 GB → /data
# ─────────────────────────────────────────────
log "[2/9] Setting up NVMe RAID-0..."

# Number of NVMe data drives the array is built from.
EXPECTED_NVME=4

# Wait for the NVMe devices to appear: up to 60 polls, 2 s apart.
# Anything named nvme0* is excluded from the list.
# NOTE(review): this presumes nvme0 is the root volume — confirm for the AMI.
NVME_DEVS=()
COUNT=0
for ((i = 1; i <= 60; i++)); do
  mapfile -t NVME_DEVS < <(
    lsblk -d -o NAME,TYPE \
      | awk '/nvme/ && /disk/ {print "/dev/"$1}' \
      | sort \
      | grep -v nvme0 || true
  )
  COUNT=${#NVME_DEVS[@]}
  if ((COUNT >= EXPECTED_NVME)); then
    log " → Found $COUNT NVMe devices: ${NVME_DEVS[*]}"
    break
  fi
  log " → Waiting for NVMe devices ($i/60)..."
  sleep 2
done

# Check the device count BEFORE anything destructive happens: with too few
# drives the array cannot be built, and with too many we would be wiping a
# disk that was never meant to be part of it.
if ((COUNT != EXPECTED_NVME)); then
  log " ✗ Expected $EXPECTED_NVME NVMe data devices, found $COUNT: ${NVME_DEVS[*]:-none} — aborting before wiping anything"
  exit 1
fi

# Wipe any existing superblocks (best effort: a blank drive has none).
for dev in "${NVME_DEVS[@]}"; do
  mdadm --zero-superblock --force "$dev" 2>/dev/null || true
  wipefs -af "$dev" 2>/dev/null || true
done

# Create RAID-0 across all NVMe data drives
mdadm --create /dev/md0 \
  --level=0 \
  --raid-devices="$COUNT" \
  --chunk=512 \
  "${NVME_DEVS[@]}" \
  --force --run

# Format with ext4; inode tables and journal are initialised up front
# (lazy init disabled) rather than in the background after mount.
mkfs.ext4 -F \
  -E lazy_itable_init=0,lazy_journal_init=0 \
  -O large_file \
  /dev/md0

# Mount
mkdir -p /data
mount /dev/md0 /data

# Persist the mount by filesystem UUID (md device numbers are not guaranteed
# to be stable across reboots) and only if /data has no fstab entry yet.
MD_UUID=$(blkid -s UUID -o value /dev/md0)
if ! grep -qE '[[:space:]]/data[[:space:]]' /etc/fstab; then
  echo "UUID=$MD_UUID /data ext4 defaults,nofail,noatime 0 0" >> /etc/fstab
fi

DISK_SIZE=$(df -h /data | awk 'NR==2{print $2}')
log " → /data mounted — size: $DISK_SIZE"

# Save mdadm config for persistence
mkdir -p /etc/mdadm
mdadm --detail --scan >> /etc/mdadm/mdadm.conf
update-initramfs -u

# ─────────────────────────────────────────────
# 3. Install s5cmd v2.2.2
# ─────────────────────────────────────────────
log "[3/9] Installing s5cmd v2.2.2..."

# -f makes curl fail on an HTTP error instead of piping the error page into
# tar; --retry covers transient network failures during early boot.
# Only the `s5cmd` member of the archive is extracted.
curl -fsSL --retry 3 \
  https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz \
  | tar xz -C /usr/local/bin s5cmd
chmod +x /usr/local/bin/s5cmd
log " → $(s5cmd version)"

# ─────────────────────────────────────────────
# 4. S3 Sync — full s3://project-velocity/ → /data/
# ─────────────────────────────────────────────
log "[4/9] Syncing s3://project-velocity/ → /data/ ..."
mkdir -p /data/models /data/comfy-engine

# Sync models (12.9 GiB — large files, concurrency 32, part size 50)
log " → Syncing models/ prefix..."
s5cmd sync \
  --concurrency 32 \
  --part-size 50 \
  "s3://project-velocity/models/*" \
  /data/models/

# Sync comfy-engine (2.3 MiB — comfy_engine.zip)
log " → Syncing comfy-engine/ prefix..."
s5cmd sync \
  --concurrency 8 \
  "s3://project-velocity/comfy-engine/*" \
  /data/comfy-engine/

# List what arrived so the bootstrap log shows exactly which files are present.
log " → S3 sync complete."
log " → models/ contents:"
ls -lh /data/models/
log " → comfy-engine/ contents:"
ls -lh /data/comfy-engine/

# ─────────────────────────────────────────────
# 5. Extract ComfyUI from comfy_engine.zip
# ─────────────────────────────────────────────
log "[5/9] Extracting ComfyUI from comfy_engine.zip..."

COMFY_ZIP=/data/comfy-engine/comfy_engine.zip
if [[ ! -f "$COMFY_ZIP" ]]; then
  log " ✗ $COMFY_ZIP not found — was it present under s3://project-velocity/comfy-engine/?"
  exit 1
fi

mkdir -p /data/comfy_engine
# -o: overwrite without prompting (there is no terminal to answer a prompt).
unzip -q -o "$COMFY_ZIP" -d /data/comfy_engine/

# Detect ComfyUI root (the zip may have a top-level folder).
# When main.py is not at the top level, search up to 3 levels deep and take
# the SHALLOWEST match so a main.py nested deeper in the tree is not picked
# by accident.
COMFY_ROOT=""
if [[ -f /data/comfy_engine/main.py ]]; then
  COMFY_ROOT="/data/comfy_engine"
else
  MAIN_PY=$(
    find /data/comfy_engine -maxdepth 3 -name "main.py" -printf '%d\t%p\n' \
      | awk -F'\t' 'NR == 1 || $1 + 0 < depth { depth = $1 + 0; path = $2 } END { print path }'
  )
  if [[ -z "$MAIN_PY" ]]; then
    log " ✗ No main.py found within 3 levels of /data/comfy_engine — cannot locate ComfyUI root"
    exit 1
  fi
  COMFY_ROOT=$(dirname "$MAIN_PY")
fi
log " → ComfyUI root: $COMFY_ROOT"

# Publish COMFY_ROOT system-wide, replacing any earlier entry instead of
# stacking duplicates.
if [[ -f /etc/environment ]]; then
  sed -i '/^COMFY_ROOT=/d' /etc/environment
fi
echo "COMFY_ROOT=$COMFY_ROOT" >> /etc/environment

# ─────────────────────────────────────────────
# 6. Create model directories & symlinks
# ─────────────────────────────────────────────
log "[6/9] Creating model directory structure and symlinks..."

# test_inputs is created as well: the systemd service passes it to ComfyUI
# as --input-directory.
mkdir -p \
  "$COMFY_ROOT/models/checkpoints" \
  "$COMFY_ROOT/models/controlnet" \
  "$COMFY_ROOT/models/sams" \
  "$COMFY_ROOT/models/ipadapter" \
  "$COMFY_ROOT/models/vae" \
  "$COMFY_ROOT/models/upscale_models" \
  "$COMFY_ROOT/test_inputs" \
  "$COMFY_ROOT/test_outputs" \
  "$COMFY_ROOT/cache/masks"

#######################################
# Symlink one model file from /data/models into a ComfyUI model directory.
# A missing source file is logged as a warning and skipped; a failing `ln`
# is a real error and is not reported as "not found".
# Globals:   COMFY_ROOT (read)
# Arguments: $1 - label used in the warning (e.g. "ControlNet")
#            $2 - sub-directory of $COMFY_ROOT/models to link into
#            $3 - file name inside /data/models
#######################################
link_model() {
  local label=$1 subdir=$2 file=$3
  if [[ -f "/data/models/$file" ]]; then
    ln -sf "/data/models/$file" "$COMFY_ROOT/models/$subdir/"
  else
    log " ⚠ $label model not found: $file (can download later)"
  fi
}

# Symlink all models from /data/models/ into ComfyUI model paths
# Checkpoints
link_model "Checkpoint" checkpoints "realvisxlV50_v50LightningBakedvae.safetensors"

# ControlNet
for f in control_v11f1p_sd15_depth.pth control_v11p_sd15_canny.pth \
  control_v11p_sd15_seg.pth control_v11p_sd15_mlsd.pth; do
  link_model "ControlNet" controlnet "$f"
done

# SAM
link_model "SAM" sams "sam_vit_h_4b8939.pth"

# IPAdapter
link_model "IPAdapter" ipadapter "ip-adapter-faceid-plusv2_sdxl.bin"

log " → Model symlinks created."

# ─────────────────────────────────────────────
# 7. Python / pip environment
# ─────────────────────────────────────────────
log "[7/9] Installing Python dependencies..."

# AMI ships with Python 3.10+ and PyTorch 2.7 + CUDA 12.4 in /opt/conda
# Activate conda base if present
if [[ -f /opt/conda/etc/profile.d/conda.sh ]]; then
  # conda's shell hooks are third-party code that may read unset variables;
  # relax `nounset` only while they run.
  set +u
  # shellcheck disable=SC1091
  source /opt/conda/etc/profile.d/conda.sh
  conda activate base
  set -u
  PYTHON=python
else
  PYTHON=python3
fi

# Upgrade pip
"$PYTHON" -m pip install --upgrade pip --quiet

# Install ComfyUI requirements (if present in extracted zip)
if [[ -f "$COMFY_ROOT/requirements.txt" ]]; then
  "$PYTHON" -m pip install -r "$COMFY_ROOT/requirements.txt" --quiet
  log " → ComfyUI requirements installed"
fi

# Install Dream Weaver automation requirements
if [[ -f "$COMFY_ROOT/comfy_engine/requirements.txt" ]]; then
  "$PYTHON" -m pip install -r "$COMFY_ROOT/comfy_engine/requirements.txt" --quiet
fi

# Core additional packages.
# Every requirement specifier is quoted: unquoted, the shell parses `>` in
# `numpy>=1.24.0` as an output redirection, which drops the version
# constraint and writes pip's output to a file named `=1.24.0`.
# NOTE(review): xformers is unpinned; confirm pip does not replace the
# AMI's preinstalled PyTorch build while resolving it.
"$PYTHON" -m pip install \
  "xformers" \
  "numpy>=1.24.0" \
  "Pillow>=10.0.0" \
  "opencv-python>=4.8.0" \
  "requests>=2.31.0" \
  "websockets>=11.0.0" \
  "aiohttp>=3.8.0" \
  "aiofiles>=23.0.0" \
  "watchdog>=3.0.0" \
  "nvidia-ml-py3" \
  --quiet

log " → All Python dependencies installed."

# ─────────────────────────────────────────────
# 8. Clone custom nodes
# ─────────────────────────────────────────────
log "[8/9] Installing ComfyUI custom nodes..."
CUSTOM_NODES="$COMFY_ROOT/custom_nodes"
mkdir -p "$CUSTOM_NODES"
cd "$CUSTOM_NODES"

REPOS=(
  "https://github.com/Fannovel16/comfyui_controlnet_aux.git"
  "https://github.com/ltdrdata/ComfyUI-Impact-Pack.git"
  "https://github.com/ltdrdata/ComfyUI-Manager.git"
  "https://github.com/WASasquatch/was-node-suite-comfyui.git"
  "https://github.com/Kosinkadink/ComfyUI-Advanced-ControlNet.git"
  "https://github.com/storyicon/comfyui_segment_anything.git"
  "https://github.com/cubiq/ComfyUI_IPAdapter_plus.git"
)

# Each node is best effort: a failed clone, pull or requirements install is
# logged and the loop moves on, so one broken repository cannot abort the
# whole bootstrap.
for repo in "${REPOS[@]}"; do
  NAME=$(basename "$repo" .git)
  if [[ -d "$NAME" ]]; then
    log " → $NAME already exists, pulling..."
    git -C "$NAME" pull --quiet || log " ⚠ Failed to update $NAME"
  else
    log " → Cloning $NAME..."
    git clone --quiet --depth 1 "$repo" "$NAME" || log " ⚠ Failed to clone $NAME"
  fi

  # Install node requirements (the file is absent when the clone failed)
  if [[ -f "$NAME/requirements.txt" ]]; then
    "$PYTHON" -m pip install -r "$NAME/requirements.txt" --quiet \
      || log " ⚠ Failed to install requirements for $NAME"
  fi
done

log " → Custom nodes installed."

# ─────────────────────────────────────────────
# 9. Create ComfyUI systemd service
# ─────────────────────────────────────────────
log "[9/9] Creating comfyui systemd service..."

# Resolve the interpreter to an absolute path: ExecStart needs one.
PYTHON_BIN=$(command -v "$PYTHON")

# The heredoc delimiter is unquoted on purpose: ${COMFY_ROOT} and
# ${PYTHON_BIN} are expanded now, and each `\\` becomes a single `\`
# (systemd's line-continuation character) in the unit file.
# The unit is ordered after network-online.target, which it also Wants=;
# Wants= alone pulls the target in but does not make the service wait for it.
# NOTE(review): confirm that this main.py accepts --bf16 and --xformers.
cat > /etc/systemd/system/comfyui.service << EOF
[Unit]
Description=ComfyUI — Dream Weaver (Desineuron L4 Node)
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=ubuntu
WorkingDirectory=${COMFY_ROOT}
ExecStart=${PYTHON_BIN} ${COMFY_ROOT}/main.py \\
--port 8000 \\
--listen 0.0.0.0 \\
--bf16 \\
--highvram \\
--xformers \\
--output-directory ${COMFY_ROOT}/test_outputs \\
--input-directory ${COMFY_ROOT}/test_inputs
Restart=on-failure
RestartSec=10
StandardOutput=append:/var/log/comfyui.log
StandardError=append:/var/log/comfyui.log
Environment=PYTHONUNBUFFERED=1
Environment=CUDA_VISIBLE_DEVICES=0,1,2,3

[Install]
WantedBy=multi-user.target
EOF

# Ensure ubuntu owns everything
chown -R ubuntu:ubuntu /data

# Create the service log first, then restrict it, so a freshly created file
# also ends up with mode 600.
touch /var/log/comfyui.log
chmod 600 /var/log/comfyui.log
chown ubuntu:ubuntu /var/log/comfyui.log

# Enable and start
systemctl daemon-reload
systemctl enable comfyui.service
systemctl start comfyui.service

log "========================================================"
log " Bootstrap COMPLETE"
log " ComfyUI listening on 0.0.0.0:8000"
log " /data size: $(df -h /data | awk 'NR==2{print $2}')"
log " GPU count: $(nvidia-smi --list-gpus | wc -l)"
log "========================================================"