Initial commit: Velocity-OS migration

This commit is contained in:
2026-05-01 12:32:19 +05:30
commit 407af828d4
283 changed files with 207782 additions and 0 deletions

View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import subprocess
import sys
from pathlib import Path
import boto3
# Checkpoint filename -> S3 source URI. parse_checkpoints() returns a copy of
# this mapping when the COMFY_CHECKPOINTS_JSON environment variable is unset.
DEFAULT_CHECKPOINTS = {
"realvisxlV50_v50LightningBakedvae.safetensors": (
"s3://project-velocity/models/realvisxlV50_v50LightningBakedvae.safetensors"
),
}
def load_env_file(path: Path) -> dict[str, str]:
    """Parse a dotenv-style ``KEY=VALUE`` file into a dictionary.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    A leading ``export `` is dropped, whitespace around the key and value is
    stripped, and one pair of matching single or double quotes around the
    value is removed so that ``KEY="value"`` yields ``value``.

    Args:
        path: Location of the env file.

    Returns:
        The parsed key/value pairs; an empty dict when *path* does not exist.
    """
    data: dict[str, str] = {}
    if not path.exists():
        return data
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # Tolerate shell-style files that prefix assignments with `export`.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        key, separator, value = line.partition("=")
        key = key.strip()
        if not separator or not key:
            continue
        value = value.strip()
        # Without this, quoted secrets would be exported with the literal
        # quote characters and be rejected by the service that consumes them.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        data[key] = value
    return data
def env(name: str, default: str = "") -> str:
    """Return environment variable *name*, or *default* when it is not set."""
    value = os.environ.get(name)
    if value is None:
        return default
    return value
def resolve_target_instance(ec2) -> dict | None:
    """Pick the EC2 instance that should receive the checkpoints.

    When COMFY_INSTANCE_ID is set, that instance is looked up directly.
    Otherwise running instances are searched by tag, using
    COMFY_INSTANCE_TAG_KEY (default ``DesineuronRole``) and
    COMFY_INSTANCE_TAG_VALUE (default ``comfyui``).

    Args:
        ec2: Client object exposing ``describe_instances``.

    Returns:
        The most recently launched instance whose state is ``running``, or
        ``None`` when there is no such instance.
    """
    pinned_instance_id = env("COMFY_INSTANCE_ID")
    if pinned_instance_id:
        response = ec2.describe_instances(InstanceIds=[pinned_instance_id])
    else:
        tag_key = env("COMFY_INSTANCE_TAG_KEY", "DesineuronRole")
        tag_value = env("COMFY_INSTANCE_TAG_VALUE", "comfyui")
        filters = [
            {"Name": "instance-state-name", "Values": ["running"]},
            {"Name": f"tag:{tag_key}", "Values": [tag_value]},
        ]
        response = ec2.describe_instances(Filters=filters)

    # The explicit-id lookup is not state-filtered, so filter here for both paths.
    running = []
    for reservation in response["Reservations"]:
        for candidate in reservation["Instances"]:
            if candidate["State"]["Name"] == "running":
                running.append(candidate)

    if not running:
        return None
    # max() keeps the first of equally-new instances, like a stable reverse sort.
    return max(running, key=lambda row: row["LaunchTime"])
def parse_checkpoints() -> dict[str, str]:
    """Return the checkpoint filename -> source URI mapping to hydrate.

    Reads COMFY_CHECKPOINTS_JSON; when it is unset or empty a copy of
    DEFAULT_CHECKPOINTS is returned. Keys and values are coerced to ``str``.

    Raises:
        ValueError: If the variable holds JSON that is not an object (invalid
            JSON surfaces as ``json.JSONDecodeError`` from ``json.loads``).
    """
    raw = env("COMFY_CHECKPOINTS_JSON")
    if raw:
        parsed = json.loads(raw)
        if isinstance(parsed, dict):
            return {str(name): str(source) for name, source in parsed.items()}
        raise ValueError("COMFY_CHECKPOINTS_JSON must be a JSON object of filename to source URI")
    return dict(DEFAULT_CHECKPOINTS)
def remote_hydration_script(checkpoints: dict[str, str]) -> str:
    """Build the Bash script that hydrates ComfyUI checkpoints on the GPU host.

    The returned script:
      1. refuses to run (exit 2) unless /opt/dlami/nvme is a mountpoint, and
         only then creates the checkpoint directory, so nothing is written to
         the root disk when the NVMe volume is missing;
      2. writes the filename/source pairs to a private ``mktemp`` manifest that
         an EXIT trap removes even when a download fails;
      3. downloads every checkpoint that is missing or empty with ``aws s3 cp``
         via a ``.part`` file that is renamed into place;
      4. restarts ``comfyui`` if anything changed, then queries the local
         ``/models/checkpoints`` endpoint.

    Args:
        checkpoints: Mapping of checkpoint filename to S3 source URI.

    Returns:
        The script text, intended to be piped into ``bash -s``.
    """
    payload = json.dumps(checkpoints)
    # The JSON is embedded as a Python string literal inside a quoted heredoc,
    # so neither the local f-string nor the remote shell expands its contents.
    return f"""#!/usr/bin/env bash
set -euo pipefail
CHECKPOINT_DIR="${{COMFY_CHECKPOINT_DIR:-/opt/dlami/nvme/ComfyUI/models/checkpoints}}"
if ! mountpoint -q /opt/dlami/nvme; then
  echo "GPU NVMe mount /opt/dlami/nvme is not mounted" >&2
  exit 2
fi
mkdir -p "$CHECKPOINT_DIR"
manifest="$(mktemp /tmp/desineuron-comfy-checkpoints.XXXXXX)"
trap 'rm -f "$manifest"' EXIT
changed=0
python3 - <<'PY' > "$manifest"
import json
for name, source in json.loads({payload!r}).items():
    print(f"{{name}}\\t{{source}}")
PY
while IFS=$'\\t' read -r filename source; do
  target="$CHECKPOINT_DIR/$filename"
  if [ ! -s "$target" ]; then
    tmp="$target.part"
    rm -f "$tmp"
    aws s3 cp "$source" "$tmp" --no-progress
    mv "$tmp" "$target"
    chmod 0644 "$target"
    changed=1
  fi
done < "$manifest"
if [ "$changed" = "1" ]; then
  sudo systemctl restart comfyui
fi
sleep 3
curl -fsS http://127.0.0.1:8188/models/checkpoints
"""
def main() -> int:
    """Hydrate ComfyUI checkpoints on the running GPU instance over SSH.

    AWS credentials/region missing from the process environment are filled in
    from the ops env file, the target instance is resolved through EC2, and the
    generated hydration script is piped to ``bash -s`` on that host.

    Returns:
        0 on success, 1 when no reachable instance is found, otherwise the
        exit status of the ssh command.
    """
    env_file = Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env"))
    ops_env = load_env_file(env_file)

    # Values already present in the process environment win over the env file.
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"):
        if name in ops_env and name not in os.environ:
            os.environ[name] = ops_env[name]
    fallback_region = ops_env.get("OPS_DEFAULT_REGION", "us-east-1")
    os.environ.setdefault("AWS_DEFAULT_REGION", fallback_region)

    default_key_path = ops_env.get(
        "OPS_SSH_KEY_PATH",
        "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem",
    )
    key_path = env("GPU_SSH_KEY_PATH", default_key_path)
    # Paths under /app/state/ are rewritten to the ops control plane state directory.
    if key_path.startswith("/app/state/"):
        key_path = key_path.replace("/app/state/", "/opt/desineuron-ops-control-plane/state/")
    ssh_user = env("GPU_SSH_USER", "ubuntu")

    ec2 = boto3.client("ec2", region_name=os.environ["AWS_DEFAULT_REGION"])
    instance = resolve_target_instance(ec2)
    if not instance:
        print("No running ComfyUI GPU instance found", file=sys.stderr)
        return 1

    target_host = instance.get("PublicIpAddress") or instance.get("PrivateIpAddress")
    if not target_host:
        print("Target GPU instance has no reachable IP", file=sys.stderr)
        return 1

    checkpoints = parse_checkpoints()
    ssh_options = [
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        "ConnectTimeout=15",
        "-i",
        key_path,
    ]
    command = ["sudo", "ssh", *ssh_options, f"{ssh_user}@{target_host}", "bash -s"]
    result = subprocess.run(
        command,
        input=remote_hydration_script(checkpoints),
        text=True,
        capture_output=True,
        check=False,
    )

    if result.stdout:
        print(result.stdout.strip())
    failed = result.returncode != 0
    # stderr is only surfaced when the remote command failed.
    if failed and result.stderr:
        print(result.stderr.strip(), file=sys.stderr)
    return result.returncode if failed else 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# ============================================================
# Velocity-OS — Model Hydration Script
# Uses s5cmd for high-throughput parallel S3 → NVMe sync.
# Run once at initial install; safe to re-run for updates.
#
# Models synced (all stored on NVMe, never in Docker images):
# - Wan 2.2 (ComfyUI video/image model)
# - Qwen-Image 2512 (ComfyUI poster/image model)
# - Qwen3.6 35B A3B (SGLang LLM — MIG slice 0)
#
# Requires: s5cmd, AWS credentials with S3 read access
# ============================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────
S3_BUCKET="${S3_MODEL_BUCKET:-s3://desineuron-models}"
LOCAL_BASE="/opt/dlami/nvme/models"
S5CMD_CONCURRENCY="${S5CMD_CONCURRENCY:-32}"   # Tune to NVMe write IOPS
S5CMD_CREDENTIALS_FILE="/etc/velocity/aws-credentials"

# Local model directories: created up front and scanned for checksums at the end.
MODEL_DIRS=(
  "${LOCAL_BASE}/comfy/wan2.2"
  "${LOCAL_BASE}/comfy/qwen-image-2512"
  "${LOCAL_BASE}/llm/qwen3.6-35b-a3b"
)

# sync_model <s3-prefix> <local-dir>
# Sync every object under ${S3_BUCKET}/<s3-prefix>/ into <local-dir>/.
sync_model() {
  local prefix="$1"
  local dest="$2"
  s5cmd \
    --numworkers "${S5CMD_CONCURRENCY}" \
    --credentials-file "${S5CMD_CREDENTIALS_FILE}" \
    sync \
    "${S3_BUCKET}/${prefix}/*" \
    "${dest}/"
}

# ── Ensure directories exist ─────────────────────────────────
mkdir -p "${MODEL_DIRS[@]}"

# ── Check s5cmd installed ────────────────────────────────────
if ! command -v s5cmd &> /dev/null; then
  echo "ERROR: s5cmd not found. Install from https://github.com/peak/s5cmd" >&2
  echo "  curl -L https://github.com/peak/s5cmd/releases/latest/download/s5cmd_Linux_x86_64.tar.gz | tar xz -C /usr/local/bin" >&2
  exit 1
fi

echo "=== Velocity-OS Model Hydration ==="
echo "Source: ${S3_BUCKET}"
echo "Target: ${LOCAL_BASE}"
echo "s5cmd workers: ${S5CMD_CONCURRENCY}"
echo ""

# ── Wan 2.2 (ComfyUI — MIG slice 1) ──────────────────────────
echo "[1/3] Syncing Wan 2.2..."
sync_model "wan2.2" "${LOCAL_BASE}/comfy/wan2.2"
echo " ✓ Wan 2.2 synced."

# ── Qwen-Image 2512 (ComfyUI — MIG slice 1) ──────────────────
echo "[2/3] Syncing Qwen-Image 2512..."
sync_model "qwen-image-2512" "${LOCAL_BASE}/comfy/qwen-image-2512"
echo " ✓ Qwen-Image 2512 synced."

# ── Qwen3.6 35B A3B (SGLang — MIG slice 0) ───────────────────
echo "[3/3] Syncing Qwen3.6 35B A3B (LLM — ~70GB, be patient)..."
sync_model "qwen3.6-35b-a3b" "${LOCAL_BASE}/llm/qwen3.6-35b-a3b"
echo " ✓ Qwen3.6 35B synced."

# ── Verify checksums (optional — if .sha256 files exist in S3) ─
# The check runs inside an explicit `if`: in a bare `cmd && echo` list,
# `set -e` ignores a failing `cmd`, so a corrupt download would otherwise
# still end with "hydration complete" and exit status 0.
echo ""
echo "=== Verifying checksums ==="
checksum_failures=0
for dir in "${MODEL_DIRS[@]}"; do
  if compgen -G "${dir}/*.sha256" > /dev/null; then
    echo " Checking ${dir}..."
    if (cd "${dir}" && sha256sum -c ./*.sha256 --quiet); then
      echo "${dir} checksums OK"
    else
      echo "ERROR: checksum verification failed in ${dir}" >&2
      checksum_failures=$((checksum_failures + 1))
    fi
  fi
done
if [[ "${checksum_failures}" -gt 0 ]]; then
  echo "ERROR: ${checksum_failures} model directory(ies) failed checksum verification" >&2
  exit 1
fi

echo ""
echo "=== Model hydration complete ==="
echo "NVMe usage:"
du -sh "${LOCAL_BASE}"/*/* 2>/dev/null || true

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
# Replace the single comfyui.service with a pool of comfyui-worker@N.service
# instances (N = 0..COMFY_WORKER_COUNT-1) and probe each worker's HTTP API on
# port 8188+N. Set STOP_SGLANG_FOR_COMFY_POOL=1 to stop desineuron-sglang first.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STOP_SGLANG_FOR_COMFY_POOL="${STOP_SGLANG_FOR_COMFY_POOL:-0}"
WORKERS="${COMFY_WORKER_COUNT:-4}"

# WORKERS feeds shell arithmetic below; reject anything that is not a plain
# decimal integer instead of failing later with an obscure arithmetic error.
if [[ ! "${WORKERS}" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "COMFY_WORKER_COUNT must be a non-negative integer, got '${WORKERS}'" >&2
  exit 1
fi

if [[ ! -d /opt/dlami/nvme/ComfyUI ]]; then
  echo "Missing ComfyUI at /opt/dlami/nvme/ComfyUI" >&2
  exit 1
fi

if ! mountpoint -q /opt/dlami/nvme; then
  echo "/opt/dlami/nvme is not mounted; refusing to run model workers on root disk" >&2
  exit 1
fi

if [[ "$STOP_SGLANG_FOR_COMFY_POOL" == "1" ]]; then
  sudo systemctl stop desineuron-sglang.service || true
fi

# The standalone service is stopped and disabled before the pool takes over;
# both are best-effort because the unit may not exist on this host.
sudo systemctl stop comfyui.service || true
sudo systemctl disable comfyui.service || true

sudo install -m 0755 "$SCRIPT_DIR/desineuron-start-comfy-worker" /usr/local/bin/desineuron-start-comfy-worker
sudo install -m 0644 "$SCRIPT_DIR/comfyui-worker@.service" /etc/systemd/system/comfyui-worker@.service
sudo systemctl daemon-reload

for index in $(seq 0 "$((WORKERS - 1))"); do
  sudo systemctl enable --now "comfyui-worker@${index}.service"
  # Restart so an already-running worker picks up the freshly installed files.
  sudo systemctl restart "comfyui-worker@${index}.service"
done

sleep 5

for index in $(seq 0 "$((WORKERS - 1))"); do
  port=$((8188 + index))
  echo "worker ${index} http://127.0.0.1:${port}"
  # Capture first, then truncate: piping curl into `head -c` under pipefail
  # can abort the script with a curl write error once head closes the pipe.
  response="$(curl -fsS "http://127.0.0.1:${port}/models/checkpoints")"
  printf '%s\n' "${response:0:500}"
done

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env bash
# Install the SGLang runtime: create a virtualenv under RUNTIME_ROOT, write
# /etc/default/desineuron-sglang, install the launcher script and the
# desineuron-sglang.service systemd unit, then enable and start the service.
set -euo pipefail

NVME_ROOT="${NVME_ROOT:-/opt/dlami/nvme/sglang}"
RUNTIME_ROOT="${RUNTIME_ROOT:-/opt/desineuron-sglang}"
VENV_PATH="${RUNTIME_ROOT}/.venv"
PORT="${SGLANG_PORT:-30100}"
HOST="${SGLANG_HOST:-}"
MODEL_ID="${SGLANG_MODEL_ID:-qwen3.6-35b-a3b}"
MODEL_PATH="${SGLANG_MODEL_PATH:-/opt/dlami/nvme/models/Qwen-Qwen3.6-35B-A3B-FP8}"
TP_SIZE="${SGLANG_TP_SIZE:-4}"
CONTEXT_LENGTH="${SGLANG_CONTEXT_LENGTH:-131072}"
MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_STATIC:-0.88}"
ATTENTION_BACKEND="${SGLANG_ATTENTION_BACKEND:-flashinfer}"
DIST_INIT_ADDR="${SGLANG_DIST_INIT_ADDR:-127.0.0.1:50000}"

# Resolve the bind address: explicit SGLANG_HOST, else the EC2 private IPv4
# from IMDSv2, else the first address reported by `hostname -I`.
# The timeouts keep the script from hanging when the metadata endpoint is
# unreachable (for example when run outside EC2).
if [[ -z "${HOST}" ]]; then
  IMDS_TOKEN="$(curl -fsS --connect-timeout 2 --max-time 5 -X PUT http://169.254.169.254/latest/api/token -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600' || true)"
  if [[ -n "${IMDS_TOKEN}" ]]; then
    HOST="$(curl -fsS --connect-timeout 2 --max-time 5 -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" http://169.254.169.254/latest/meta-data/local-ipv4 || true)"
  fi
fi
if [[ -z "${HOST}" ]]; then
  HOST="$(hostname -I | awk '{print $1}')"
fi
if [[ -z "${HOST}" ]]; then
  echo "Unable to resolve GPU private IP for SGLang host binding" >&2
  exit 1
fi

sudo mkdir -p "${NVME_ROOT}"/{cache,logs,state} "${RUNTIME_ROOT}"
# RUNTIME_ROOT was just created by root; hand it to the invoking user so the
# unprivileged venv/pip steps below can write into it.
sudo chown "$(id -u):$(id -g)" "${RUNTIME_ROOT}"

python3 -m venv "${VENV_PATH}"
"${VENV_PATH}/bin/pip" install --upgrade pip wheel setuptools
"${VENV_PATH}/bin/pip" install "sglang[all]>=0.5.3" flashinfer-python huggingface_hub

# Shared configuration: read by systemd (EnvironmentFile=) and sourced by the
# launcher script, so values must stay valid for both parsers.
sudo tee /etc/default/desineuron-sglang >/dev/null <<EOF
SGLANG_HOST=${HOST}
SGLANG_PORT=${PORT}
SGLANG_MODEL_ID=${MODEL_ID}
SGLANG_MODEL_PATH=${MODEL_PATH}
SGLANG_TP_SIZE=${TP_SIZE}
SGLANG_CONTEXT_LENGTH=${CONTEXT_LENGTH}
SGLANG_MEM_FRACTION_STATIC=${MEM_FRACTION_STATIC}
SGLANG_ATTENTION_BACKEND=${ATTENTION_BACKEND}
SGLANG_DIST_INIT_ADDR=${DIST_INIT_ADDR}
SGLANG_CACHE_DIR=${NVME_ROOT}/cache
SGLANG_LOG_DIR=${NVME_ROOT}/logs
SGLANG_STATE_DIR=${NVME_ROOT}/state
SGLANG_VENV_PATH=${VENV_PATH}
SGLANG_USE_FLASHINFER=1
SGLANG_ENABLE_PREFIX_CACHE=1
SGLANG_SERVED_MODEL_NAME=${MODEL_ID}
SGLANG_EXTRA_ARGS=
EOF
sudo chmod 600 /etc/default/desineuron-sglang

# Quoted heredoc: nothing below is expanded at install time; the launcher
# resolves every variable from /etc/default/desineuron-sglang at run time.
sudo tee /usr/local/bin/desineuron-sglang-launch.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
source /etc/default/desineuron-sglang
export HF_HOME="${SGLANG_CACHE_DIR}/hf"
export HUGGINGFACE_HUB_CACHE="${SGLANG_CACHE_DIR}/hf"
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SGLANG_USE_FLASHINFER="${SGLANG_USE_FLASHINFER}"
# SGLANG_VENV_PATH follows a custom RUNTIME_ROOT; the fallback keeps env files
# written by earlier installs working.
# SGLANG_EXTRA_ARGS is intentionally unquoted so it splits into separate args.
exec "${SGLANG_VENV_PATH:-/opt/desineuron-sglang/.venv}/bin/sglang" serve \
  --host "${SGLANG_HOST}" \
  --port "${SGLANG_PORT}" \
  --model-path "${SGLANG_MODEL_PATH}" \
  --served-model-name "${SGLANG_SERVED_MODEL_NAME}" \
  --tp-size "${SGLANG_TP_SIZE}" \
  --context-length "${SGLANG_CONTEXT_LENGTH}" \
  --mem-fraction-static "${SGLANG_MEM_FRACTION_STATIC}" \
  --attention-backend "${SGLANG_ATTENTION_BACKEND}" \
  --dist-init-addr "${SGLANG_DIST_INIT_ADDR}" \
  --enable-metrics \
  --skip-server-warmup \
  ${SGLANG_EXTRA_ARGS}
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-launch.sh

# Unquoted heredoc: ${RUNTIME_ROOT} is expanded now and baked into the unit.
sudo tee /etc/systemd/system/desineuron-sglang.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
EnvironmentFile=/etc/default/desineuron-sglang
WorkingDirectory=${RUNTIME_ROOT}
ExecStart=/usr/local/bin/desineuron-sglang-launch.sh
Restart=always
RestartSec=5
LimitNOFILE=1048576
[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang.service
sudo systemctl --no-pager --full status desineuron-sglang.service

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Install the SGLang watchdog: a health-check script, a model hydration
# helper, and a systemd oneshot service driven by a 5-minute timer.
set -euo pipefail

# Watchdog: hydrates the model directory if it is missing, restarts the
# service when it is inactive or unhealthy, and exits non-zero when the health
# endpoint still fails after a restart.
sudo tee /usr/local/bin/desineuron-sglang-watchdog.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
source /etc/default/desineuron-sglang
HEALTH_URL="http://127.0.0.1:${SGLANG_PORT}/v1/models"
HYDRATE_HELPER="/usr/local/bin/desineuron-sglang-hydrate.sh"
STARTUP_GRACE_SECONDS="${SGLANG_STARTUP_GRACE_SECONDS:-900}"
HEALTH_TIMEOUT_SECONDS="${SGLANG_HEALTH_TIMEOUT_SECONDS:-60}"
# NOTE(review): SGLANG_MODEL_ID is handed to the helper as the Hugging Face
# repo id — confirm it is a valid repo id and not only the served model name.
if [[ ! -d "${SGLANG_MODEL_PATH}" ]]; then
  "${HYDRATE_HELPER}" "${SGLANG_MODEL_ID}" "${SGLANG_MODEL_PATH}"
fi
if ! systemctl is-active --quiet desineuron-sglang.service; then
  systemctl restart desineuron-sglang.service
  sleep 10
fi
# Skip the health probe while the main process is younger than the grace
# period; its age is estimated from the mtime of /proc/<pid>.
main_pid="$(systemctl show -p MainPID --value desineuron-sglang.service || true)"
if [[ -n "${main_pid}" && "${main_pid}" != "0" ]]; then
  runtime_age="$(( $(date +%s) - $(stat -c %Y "/proc/${main_pid}" 2>/dev/null || date +%s) ))"
  if (( runtime_age < STARTUP_GRACE_SECONDS )); then
    echo "startup_grace"
    exit 0
  fi
fi
if ! curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null; then
  systemctl restart desineuron-sglang.service
  sleep 20
fi
curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null
echo "healthy"
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-watchdog.sh

# Hydration helper: <model-id> <target-path>. Downloads the model snapshot
# with the `hf` CLI when available, otherwise with huggingface_hub from Python.
sudo tee /usr/local/bin/desineuron-sglang-hydrate.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
MODEL_ID="${1:?model id required}"
TARGET_PATH="${2:?target path required}"
# huggingface_hub is installed into the SGLang virtualenv, which is not on the
# PATH systemd gives the watchdog, so fall back to the venv's own binaries.
VENV_BIN="${SGLANG_VENV_BIN:-/opt/desineuron-sglang/.venv/bin}"
mkdir -p "$(dirname "${TARGET_PATH}")"
if command -v hf >/dev/null 2>&1; then
  hf download "${MODEL_ID}" --local-dir "${TARGET_PATH}" --max-workers 8
elif [[ -x "${VENV_BIN}/hf" ]]; then
  "${VENV_BIN}/hf" download "${MODEL_ID}" --local-dir "${TARGET_PATH}" --max-workers 8
else
  PYTHON_BIN="python3"
  if [[ -x "${VENV_BIN}/python" ]]; then
    PYTHON_BIN="${VENV_BIN}/python"
  fi
  # Arguments travel through argv rather than being interpolated into the
  # Python source, so quotes or backslashes in them cannot break the program.
  "${PYTHON_BIN}" - "${MODEL_ID}" "${TARGET_PATH}" <<'PY'
import sys
from huggingface_hub import snapshot_download
snapshot_download(repo_id=sys.argv[1], local_dir=sys.argv[2], max_workers=8)
PY
fi
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-hydrate.sh

sudo tee /etc/systemd/system/desineuron-sglang-watchdog.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime Watchdog
After=network-online.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/desineuron-sglang-watchdog.sh
EOF

sudo tee /etc/systemd/system/desineuron-sglang-watchdog.timer >/dev/null <<EOF
[Unit]
Description=Run the Desineuron SGLang watchdog every 5 minutes
[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Unit=desineuron-sglang-watchdog.service
[Install]
WantedBy=timers.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang-watchdog.timer
# Run the watchdog once right away; a failing run aborts this installer.
sudo systemctl start desineuron-sglang-watchdog.service
sudo systemctl --no-pager --full status desineuron-sglang-watchdog.timer