Files
Velocity-OS/infrastructure/model-hydration/hydrate_gpu_comfy_models.py

165 lines
4.9 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import subprocess
import sys
from pathlib import Path
import boto3
DEFAULT_CHECKPOINTS = {
"realvisxlV50_v50LightningBakedvae.safetensors": (
"s3://project-velocity/models/realvisxlV50_v50LightningBakedvae.safetensors"
),
}
def load_env_file(path: Path) -> dict[str, str]:
data: dict[str, str] = {}
if not path.exists():
return data
for line in path.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, value = line.split("=", 1)
data[key.strip()] = value.strip()
return data
def env(name: str, default: str = "") -> str:
return os.environ.get(name, default)
def resolve_target_instance(ec2) -> dict | None:
explicit_instance_id = env("COMFY_INSTANCE_ID")
if explicit_instance_id:
reservations = ec2.describe_instances(InstanceIds=[explicit_instance_id])["Reservations"]
else:
tag_key = env("COMFY_INSTANCE_TAG_KEY", "DesineuronRole")
tag_value = env("COMFY_INSTANCE_TAG_VALUE", "comfyui")
reservations = ec2.describe_instances(
Filters=[
{"Name": "instance-state-name", "Values": ["running"]},
{"Name": f"tag:{tag_key}", "Values": [tag_value]},
]
)["Reservations"]
instances = [
instance
for reservation in reservations
for instance in reservation["Instances"]
if instance["State"]["Name"] == "running"
]
if not instances:
return None
instances.sort(key=lambda row: row["LaunchTime"], reverse=True)
return instances[0]
def parse_checkpoints() -> dict[str, str]:
raw = env("COMFY_CHECKPOINTS_JSON")
if not raw:
return dict(DEFAULT_CHECKPOINTS)
parsed = json.loads(raw)
if not isinstance(parsed, dict):
raise ValueError("COMFY_CHECKPOINTS_JSON must be a JSON object of filename to source URI")
return {str(name): str(source) for name, source in parsed.items()}
def remote_hydration_script(checkpoints: dict[str, str]) -> str:
payload = json.dumps(checkpoints)
return f"""#!/usr/bin/env bash
set -euo pipefail
CHECKPOINT_DIR="${{COMFY_CHECKPOINT_DIR:-/opt/dlami/nvme/ComfyUI/models/checkpoints}}"
mkdir -p "$CHECKPOINT_DIR"
if ! mountpoint -q /opt/dlami/nvme; then
echo "GPU NVMe mount /opt/dlami/nvme is not mounted" >&2
exit 2
fi
changed=0
python3 - <<'PY' > /tmp/desineuron-comfy-checkpoints.tsv
import json
for name, source in json.loads({payload!r}).items():
print(f"{{name}}\\t{{source}}")
PY
while IFS=$'\\t' read -r filename source; do
target="$CHECKPOINT_DIR/$filename"
if [ ! -s "$target" ]; then
tmp="$target.part"
rm -f "$tmp"
aws s3 cp "$source" "$tmp" --no-progress
mv "$tmp" "$target"
chmod 0644 "$target"
changed=1
fi
done < /tmp/desineuron-comfy-checkpoints.tsv
rm -f /tmp/desineuron-comfy-checkpoints.tsv
if [ "$changed" = "1" ]; then
sudo systemctl restart comfyui
fi
sleep 3
curl -fsS http://127.0.0.1:8188/models/checkpoints
"""
def main() -> int:
ops_env = load_env_file(Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env")))
for key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"]:
if key not in os.environ and key in ops_env:
os.environ[key] = ops_env[key]
os.environ.setdefault("AWS_DEFAULT_REGION", ops_env.get("OPS_DEFAULT_REGION", "us-east-1"))
key_path = env(
"GPU_SSH_KEY_PATH",
ops_env.get("OPS_SSH_KEY_PATH", "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem"),
)
if key_path.startswith("/app/state/"):
key_path = key_path.replace("/app/state/", "/opt/desineuron-ops-control-plane/state/")
ssh_user = env("GPU_SSH_USER", "ubuntu")
ec2 = boto3.client("ec2", region_name=os.environ["AWS_DEFAULT_REGION"])
instance = resolve_target_instance(ec2)
if not instance:
print("No running ComfyUI GPU instance found", file=sys.stderr)
return 1
target_host = instance.get("PublicIpAddress") or instance.get("PrivateIpAddress")
if not target_host:
print("Target GPU instance has no reachable IP", file=sys.stderr)
return 1
checkpoints = parse_checkpoints()
command = [
"sudo",
"ssh",
"-o",
"StrictHostKeyChecking=no",
"-o",
"ConnectTimeout=15",
"-i",
key_path,
f"{ssh_user}@{target_host}",
"bash -s",
]
result = subprocess.run(
command,
input=remote_hydration_script(checkpoints),
text=True,
capture_output=True,
check=False,
)
if result.stdout:
print(result.stdout.strip())
if result.returncode != 0:
if result.stderr:
print(result.stderr.strip(), file=sys.stderr)
return result.returncode
return 0
if __name__ == "__main__":
raise SystemExit(main())