#!/bin/bash
# ============================================================
# Desineuron AWS Node 4x L4 (96GB VRAM) Spot — Bootstrap
# Instance:  g6.12xlarge | 48 vCPU | 192 GiB RAM | 4x L4 GPU
# NVMe:      4x 940 GB → RAID-0 /data (~3.76 TB)
# S3 Source: s3://project-velocity/
#
# Runs once as root (EC2 user-data). All output, including errors,
# is written to /var/log/user-data.log.
#
# Steps:
#   1. apt prerequisites
#   2. RAID-0 across the instance-store NVMe disks, mounted at /data
#   3. s5cmd install
#   4. S3 sync of models/ and comfy-engine/
#   5. unzip ComfyUI and locate its root (directory holding main.py)
#   6. model directories + symlinks
#   7. Python dependencies
#   8. ComfyUI custom nodes
#   9. systemd unit for ComfyUI on port 8000
# ============================================================

set -euo pipefail
exec > /var/log/user-data.log 2>&1

# Timestamped log line (stdout is already redirected to the log file).
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }

# Log a fatal message and abort the bootstrap.
die() {
  log "FATAL: $*"
  exit 1
}

# Record which command tripped `set -e`, so the log shows why we stopped.
trap 'log "ERROR: bootstrap aborted at line $LINENO: $BASH_COMMAND"' ERR

# Number of instance-store NVMe disks this instance type is expected to have.
readonly EXPECTED_NVME=4

log "========================================================"
log " Desineuron L4 Node Bootstrap Starting"
log "========================================================"

# ─────────────────────────────────────────────
# 1. System prep
# ─────────────────────────────────────────────
log "[1/9] System prep..."
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
# NOTE: lsblk is provided by util-linux; it is not requested as a package of
# its own because an unknown package name would abort the install.
apt-get install -y -qq mdadm unzip git curl wget nvme-cli util-linux fio

# ─────────────────────────────────────────────
# 2. NVMe RAID-0 — 4x 940 GB → /data
# ─────────────────────────────────────────────
log "[2/9] Setting up NVMe RAID-0..."

# Wait for all instance-store NVMe devices to appear (60 attempts, 2s apart).
# Disks are selected by their reported model string rather than by device
# number: NVMe enumeration order is not guaranteed, and EBS volumes (root or
# otherwise) must never be wiped.
NVME_DEVS=()
for ((i = 1; i <= 60; i++)); do
  mapfile -t NVME_DEVS < <(
    lsblk -d -n -o NAME,TYPE,MODEL \
      | awk '$2 == "disk" && /Instance Storage/ {print "/dev/" $1}' \
      | sort
  )
  if (( ${#NVME_DEVS[@]} >= EXPECTED_NVME )); then
    log " → Found ${#NVME_DEVS[@]} NVMe devices: ${NVME_DEVS[*]}"
    break
  fi
  log " → Waiting for NVMe devices ($i/60)..."
  sleep 2
done

# Do not build a RAID set from a partial device list.
if (( ${#NVME_DEVS[@]} < EXPECTED_NVME )); then
  die "expected $EXPECTED_NVME instance-store NVMe devices, found ${#NVME_DEVS[@]}"
fi

# Wipe any existing superblocks. Best-effort: fresh disks have nothing to
# wipe, so failures here are expected and ignored.
for dev in "${NVME_DEVS[@]}"; do
  mdadm --zero-superblock --force "$dev" 2>/dev/null || true
  wipefs -af "$dev" 2>/dev/null || true
done

# Create RAID-0 across all detected NVMe drives
mdadm --create /dev/md0 \
  --level=0 \
  --raid-devices="${#NVME_DEVS[@]}" \
  --chunk=512 \
  --force --run \
  "${NVME_DEVS[@]}"

# Format with ext4 (large_file, optimised for ML workloads).
# Lazy init is disabled so the filesystem is fully initialised up front.
mkfs.ext4 -F \
  -E lazy_itable_init=0,lazy_journal_init=0 \
  -O large_file \
  /dev/md0

# Mount
mkdir -p /data
mount /dev/md0 /data
echo "/dev/md0 /data ext4 defaults,nofail,noatime 0 0" >> /etc/fstab

DISK_SIZE=$(df -h /data | awk 'NR==2{print $2}')
log " → /data mounted — size: $DISK_SIZE"

# Save mdadm config for persistence
mdadm --detail --scan >> /etc/mdadm/mdadm.conf
update-initramfs -u

# ─────────────────────────────────────────────
# 3. Install s5cmd v2.2.2
# ─────────────────────────────────────────────
log "[3/9] Installing s5cmd v2.2.2..."
# -f makes curl fail on HTTP errors instead of piping an error page into tar.
curl -fsSL https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz \
  | tar xz -C /usr/local/bin s5cmd
chmod +x /usr/local/bin/s5cmd
log " → $(s5cmd version)"

# ─────────────────────────────────────────────
# 4. S3 Sync — full s3://project-velocity/ → /data/
# ─────────────────────────────────────────────
log "[4/9] Syncing s3://project-velocity/ → /data/ ..."
mkdir -p /data/models /data/comfy-engine

# Sync models (12.9 GiB — large files, 32 workers, 50MB parts)
log " → Syncing models/ prefix..."
s5cmd sync \
  --concurrency 32 \
  --part-size 50 \
  "s3://project-velocity/models/*" \
  /data/models/

# Sync comfy-engine (2.3 MiB — comfy_engine.zip)
log " → Syncing comfy-engine/ prefix..."
s5cmd sync \
  --concurrency 8 \
  "s3://project-velocity/comfy-engine/*" \
  /data/comfy-engine/

log " → S3 sync complete."
log " → models/ contents:"
ls -lh /data/models/
log " → comfy-engine/ contents:"
ls -lh /data/comfy-engine/

# ─────────────────────────────────────────────
# 5. Extract ComfyUI from comfy_engine.zip
# ─────────────────────────────────────────────
log "[5/9] Extracting ComfyUI from comfy_engine.zip..."
# Note: /data/comfy-engine (hyphen) holds the downloaded zip;
#       /data/comfy_engine (underscore) holds the extracted tree.
mkdir -p /data/comfy_engine
# -o: overwrite without prompting (there is no TTY to answer a prompt).
unzip -q -o /data/comfy-engine/comfy_engine.zip -d /data/comfy_engine/

# Detect ComfyUI root (the zip may have a top-level folder)
COMFY_ROOT=""
if [[ -f /data/comfy_engine/main.py ]]; then
  COMFY_ROOT="/data/comfy_engine"
else
  MAIN_PY=$(find /data/comfy_engine -maxdepth 3 -name "main.py" -print -quit)
  [[ -n "$MAIN_PY" ]] || die "main.py not found under /data/comfy_engine (searched 3 levels)"
  COMFY_ROOT=$(dirname "$MAIN_PY")
fi
log " → ComfyUI root: $COMFY_ROOT"
echo "COMFY_ROOT=$COMFY_ROOT" >> /etc/environment

# ─────────────────────────────────────────────
# 6. Create model directories & symlinks
# ─────────────────────────────────────────────
log "[6/9] Creating model directory structure and symlinks..."
# test_inputs is created here because the systemd unit below passes it as
# --input-directory.
mkdir -p \
  "$COMFY_ROOT/models/checkpoints" \
  "$COMFY_ROOT/models/controlnet" \
  "$COMFY_ROOT/models/sams" \
  "$COMFY_ROOT/models/ipadapter" \
  "$COMFY_ROOT/models/vae" \
  "$COMFY_ROOT/models/upscale_models" \
  "$COMFY_ROOT/test_outputs" \
  "$COMFY_ROOT/test_inputs" \
  "$COMFY_ROOT/cache/masks"

#######################################
# Symlink one model file from /data/models into a ComfyUI model directory.
# Best-effort: a missing source file or a failed link is logged, never fatal.
# Arguments: $1 - file name under /data/models
#            $2 - destination sub-directory under $COMFY_ROOT/models
#            $3 - human-readable model kind used in the warning
# Globals:   COMFY_ROOT (read)
# Returns:   0 always
#######################################
link_model() {
  local file=$1 subdir=$2 kind=$3
  local src="/data/models/$file"
  if [[ ! -f "$src" ]]; then
    log " ⚠ $kind model not found: $file (can download later)"
    return 0
  fi
  ln -sf "$src" "$COMFY_ROOT/models/$subdir/" \
    || log " ⚠ Failed to link $kind model: $file"
  return 0
}

# Symlink all models from /data/models/ into ComfyUI model paths
# Checkpoints
link_model "realvisxlV50_v50LightningBakedvae.safetensors" checkpoints "Checkpoint"

# ControlNet
for f in control_v11f1p_sd15_depth.pth control_v11p_sd15_canny.pth \
  control_v11p_sd15_seg.pth control_v11p_sd15_mlsd.pth; do
  link_model "$f" controlnet "ControlNet"
done

# SAM
link_model "sam_vit_h_4b8939.pth" sams "SAM"

# IPAdapter
link_model "ip-adapter-faceid-plusv2_sdxl.bin" ipadapter "IPAdapter"

log " → Model symlinks created."

# ─────────────────────────────────────────────
# 7. Python / pip environment
# ─────────────────────────────────────────────
log "[7/9] Installing Python dependencies..."

# AMI ships with Python 3.10+ and PyTorch 2.7 + CUDA 12.4 in /opt/conda
# Activate conda base if present. nounset is relaxed around activation
# because conda's shell hooks may reference unset variables.
if [[ -f /opt/conda/etc/profile.d/conda.sh ]]; then
  set +u
  # shellcheck disable=SC1091
  source /opt/conda/etc/profile.d/conda.sh
  conda activate base
  set -u
  PYTHON=python
else
  PYTHON=python3
fi

# Upgrade pip
"$PYTHON" -m pip install --upgrade pip --quiet

# Install ComfyUI requirements (if present in extracted zip)
if [[ -f "$COMFY_ROOT/requirements.txt" ]]; then
  "$PYTHON" -m pip install -r "$COMFY_ROOT/requirements.txt" --quiet
  log " → ComfyUI requirements installed"
fi

# Install Dream Weaver automation requirements
if [[ -f "$COMFY_ROOT/comfy_engine/requirements.txt" ]]; then
  "$PYTHON" -m pip install -r "$COMFY_ROOT/comfy_engine/requirements.txt" --quiet
fi

# Core additional packages.
# Every version specifier MUST be quoted: an unquoted `pkg>=1.0` is parsed by
# the shell as `pkg` with stdout redirected to a file named `=1.0`.
EXTRA_PIP_PACKAGES=(
  'xformers'
  'numpy>=1.24.0'
  'Pillow>=10.0.0'
  'opencv-python>=4.8.0'
  'requests>=2.31.0'
  'websockets>=11.0.0'
  'aiohttp>=3.8.0'
  'aiofiles>=23.0.0'
  'watchdog>=3.0.0'
  'nvidia-ml-py3'
)
"$PYTHON" -m pip install --quiet "${EXTRA_PIP_PACKAGES[@]}"

log " → All Python dependencies installed."

# ─────────────────────────────────────────────
# 8. Clone custom nodes
# ─────────────────────────────────────────────
log "[8/9] Installing ComfyUI custom nodes..."
CUSTOM_NODES="$COMFY_ROOT/custom_nodes"
mkdir -p "$CUSTOM_NODES"
cd "$CUSTOM_NODES"

REPOS=(
  "https://github.com/Fannovel16/comfyui_controlnet_aux.git"
  "https://github.com/ltdrdata/ComfyUI-Impact-Pack.git"
  "https://github.com/ltdrdata/ComfyUI-Manager.git"
  "https://github.com/WASasquatch/was-node-suite-comfyui.git"
  "https://github.com/Kosinkadink/ComfyUI-Advanced-ControlNet.git"
  "https://github.com/storyicon/comfyui_segment_anything.git"
  "https://github.com/cubiq/ComfyUI_IPAdapter_plus.git"
)

# Custom nodes are optional extras: a failed clone/pull/install is logged and
# the bootstrap carries on with the remaining nodes.
for repo in "${REPOS[@]}"; do
  NAME=$(basename "$repo" .git)
  if [[ -d "$NAME" ]]; then
    log " → $NAME already exists, pulling..."
    git -C "$NAME" pull --quiet || log " ⚠ Failed to pull $NAME"
  else
    log " → Cloning $NAME..."
    git clone --quiet --depth 1 "$repo" "$NAME" || log " ⚠ Failed to clone $NAME"
  fi
  # Install node requirements
  if [[ -f "$NAME/requirements.txt" ]]; then
    "$PYTHON" -m pip install -r "$NAME/requirements.txt" --quiet \
      || log " ⚠ Failed to install requirements for $NAME"
  fi
done

log " → Custom nodes installed."

# ─────────────────────────────────────────────
# 9. Create ComfyUI systemd service
# ─────────────────────────────────────────────
log "[9/9] Creating comfyui systemd service..."

# Find python binary (absolute path is required by ExecStart)
PYTHON_BIN=$(command -v "$PYTHON")

# The heredoc delimiter is unquoted on purpose: ${COMFY_ROOT} and
# ${PYTHON_BIN} are expanded now, and each `\\` becomes a single `\`
# line-continuation in the generated unit file.
# After= is network-online.target so the Wants= below actually orders the
# service after the network is up.
# NOTE(review): confirm the bundled main.py accepts --bf16 and --xformers;
# upstream ComfyUI may not define these exact flags — TODO verify.
# NOTE(review): the service listens on 0.0.0.0:8000; access control is
# presumably left to the security group — confirm.
cat > /etc/systemd/system/comfyui.service << EOF
[Unit]
Description=ComfyUI — Dream Weaver (Desineuron L4 Node)
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=ubuntu
WorkingDirectory=${COMFY_ROOT}
ExecStart=${PYTHON_BIN} ${COMFY_ROOT}/main.py \\
    --port 8000 \\
    --listen 0.0.0.0 \\
    --bf16 \\
    --highvram \\
    --xformers \\
    --output-directory ${COMFY_ROOT}/test_outputs \\
    --input-directory ${COMFY_ROOT}/test_inputs
Restart=on-failure
RestartSec=10
StandardOutput=append:/var/log/comfyui.log
StandardError=append:/var/log/comfyui.log
Environment=PYTHONUNBUFFERED=1
Environment=CUDA_VISIBLE_DEVICES=0,1,2,3

[Install]
WantedBy=multi-user.target
EOF

# Ensure ubuntu owns everything
chown -R ubuntu:ubuntu /data

# Create the log file first so the restrictive mode is applied on a fresh
# instance too, not only when the file already exists.
touch /var/log/comfyui.log
chmod 600 /var/log/comfyui.log
chown ubuntu:ubuntu /var/log/comfyui.log

# Enable and start
systemctl daemon-reload
systemctl enable comfyui.service
systemctl start comfyui.service

log "========================================================"
log " Bootstrap COMPLETE"
log " ComfyUI listening on 0.0.0.0:8000"
log " /data size: $(df -h /data | awk 'NR==2{print $2}')"
log " GPU count: $(nvidia-smi --list-gpus | wc -l)"
log "========================================================"