Initial commit: Velocity-OS migration

This commit is contained in:
2026-05-01 12:32:19 +05:30
commit 407af828d4
283 changed files with 207782 additions and 0 deletions

View File

@@ -0,0 +1,147 @@
# ============================================================
# Velocity-OS — GitLab CI/CD Pipeline
# Build → Sign → Push to ECR → Notify Ingress Box
# ============================================================
# Pipeline phases, run in the order listed.
stages:
  - lint
  - build
  - sign
  - notify

# Values shared by every job below.
variables:
  DOCKER_DRIVER: overlay2
  DOCKER_BUILDKIT: '1'
  AWS_REGION: 'ap-south-1'
  # AWS_ACCOUNT_ID is not defined in this file; it has to be supplied from
  # outside (e.g. project CI/CD variables).
  ECR_REGISTRY: '${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com'
  # Images are tagged with the short commit SHA.
  IMAGE_TAG: '${CI_COMMIT_SHORT_SHA}'
# ── Lint + Type Check ─────────────────────────────────────────
# Type-checks and lints the WebOS front end. Runs only when files under
# webos/ change.
lint:webos:
  stage: lint
  image: node:20-alpine
  variables:
    # `npm ci` wipes node_modules before installing, so caching node_modules
    # gains nothing. Cache npm's download cache instead and point npm at it.
    npm_config_cache: "${CI_PROJECT_DIR}/.npm"
  cache:
    key: "${CI_COMMIT_REF_SLUG}-node"
    paths:
      - .npm/
  script:
    - cd webos
    - npm ci --prefer-offline
    - npm run type-check
    - npm run lint
  rules:
    - changes:
        - "webos/**/*"
# Static analysis for the Python core service: ruff for lint, mypy for types.
# Runs only when files under core/ change.
lint:core:
  stage: lint
  image: python:3.11-slim
  script:
    - pip install ruff mypy -q
    - cd core
    - ruff check .
    - mypy . --ignore-missing-imports
  rules:
    - changes:
        - "core/**/*"
# ── Build + Push Images ───────────────────────────────────────
# Shared recipe for the build:* jobs. Each job sets SERVICE; the image is
# built from ./${SERVICE} and pushed to ${ECR_REGISTRY}/velocity-os/${SERVICE}.
.build_template: &build_template
  stage: build
  image: docker:24-dind
  services:
    - docker:24-dind
  before_script:
    # The docker image does not ship the AWS CLI, which the ECR login needs.
    - apk add --no-cache aws-cli
    - >-
      aws ecr get-login-password --region "${AWS_REGION}"
      | docker login --username AWS --password-stdin "${ECR_REGISTRY}"
  script:
    - |
      REPO="${ECR_REGISTRY}/velocity-os/${SERVICE}"
      docker build \
        --cache-from "${REPO}:latest" \
        --build-arg BUILDKIT_INLINE_CACHE=1 \
        --label "git.sha=${CI_COMMIT_SHA}" \
        --label "git.ref=${CI_COMMIT_REF_NAME}" \
        -t "${REPO}:${IMAGE_TAG}" \
        -t "${REPO}:latest" \
        "./${SERVICE}"
      docker push "${REPO}:${IMAGE_TAG}"
      # The ingress box polls and deploys whatever `latest` points to, so only
      # the default branch is allowed to move that tag. Other branches still
      # publish their SHA-tagged image.
      if [ "${CI_COMMIT_BRANCH:-}" = "${CI_DEFAULT_BRANCH}" ]; then
        docker push "${REPO}:latest"
      fi
# One build job per service. Each reuses .build_template, names the service
# directory through SERVICE, and runs only when that directory changes.
build:core:
  extends: .build_template
  variables:
    SERVICE: core
  rules:
    - changes:
        - "core/**/*"

build:webos:
  extends: .build_template
  variables:
    SERVICE: webos
  rules:
    - changes:
        - "webos/**/*"

build:media-engine:
  extends: .build_template
  variables:
    SERVICE: media-engine
  rules:
    - changes:
        - "media-engine/**/*"

build:agents:
  extends: .build_template
  variables:
    SERVICE: agents
  rules:
    - changes:
        - "agents/**/*"
# ── Sign Images with cosign ───────────────────────────────────
# Shared recipe for the sign:* jobs: resolve the pushed image's digest in ECR
# and sign that digest with cosign (keyless).
.sign_template: &sign_template
  stage: sign
  # The upstream cosign image has no shell, docker or AWS CLI, so the script
  # below could not run in it. Use Alpine and install the tools instead.
  image: alpine:3.20
  # Keyless signing needs an OIDC token; cosign reads SIGSTORE_ID_TOKEN.
  id_tokens:
    SIGSTORE_ID_TOKEN:
      aud: sigstore
  before_script:
    - apk add --no-cache aws-cli cosign
    # cosign pushes the signature to the registry, so it needs ECR credentials.
    - >-
      aws ecr get-login-password --region "${AWS_REGION}"
      | cosign login --username AWS --password-stdin "${ECR_REGISTRY}"
  script:
    - |
      REPO="velocity-os/${SERVICE}"
      DIGEST=$(aws ecr describe-images \
        --repository-name "${REPO}" \
        --image-ids "imageTag=${IMAGE_TAG}" \
        --region "${AWS_REGION}" \
        --query 'imageDetails[0].imageDigest' --output text)
      # Refuse to sign unless a real digest came back (the CLI prints "None"
      # for a null query result).
      case "${DIGEST}" in
        sha256:*) ;;
        *)
          echo "Could not resolve digest for ${REPO}:${IMAGE_TAG} (got '${DIGEST}')" >&2
          exit 1
          ;;
      esac
      cosign sign --yes "${ECR_REGISTRY}/${REPO}@${DIGEST}"
# One sign job per service. Each carries the same `changes` rule as the build
# job it needs: without it the sign job would be created in pipelines where
# the build job was filtered out, and GitLab rejects a `needs` entry that
# points at a job missing from the pipeline.
sign:core:
  extends: .sign_template
  variables:
    SERVICE: core
  needs:
    - build:core
  rules:
    - changes:
        - "core/**/*"

sign:webos:
  extends: .sign_template
  variables:
    SERVICE: webos
  needs:
    - build:webos
  rules:
    - changes:
        - "webos/**/*"

sign:media-engine:
  extends: .sign_template
  variables:
    SERVICE: media-engine
  needs:
    - build:media-engine
  rules:
    - changes:
        - "media-engine/**/*"

sign:agents:
  extends: .sign_template
  variables:
    SERVICE: agents
  needs:
    - build:agents
  rules:
    - changes:
        - "agents/**/*"
# ── Notify Ingress Box ────────────────────────────────────────
# Starts the polling service on the ingress box over SSH once signing is done.
# Runs on the main branch only.
notify:ingress-box:
  stage: notify
  image: alpine:latest
  before_script:
    - apk add --no-cache curl openssh-client
    # ssh refuses a private key file that other users can read.
    - chmod 600 "${INGRESS_SSH_KEY_FILE}"
  script:
    # Trigger poll_and_transfer.sh on the ingress box via SSH.
    # INGRESS_BOX_IP and the SSH key are set in GitLab CI/CD variables.
    # NOTE(review): StrictHostKeyChecking=no accepts any host key, so the
    # ingress box is not authenticated. Consider pinning its key through a
    # known_hosts CI variable.
    - |
      ssh -i "${INGRESS_SSH_KEY_FILE}" \
        -o StrictHostKeyChecking=no \
        "ubuntu@${INGRESS_BOX_IP}" \
        "sudo systemctl start velocity-ingress-poll.service"
  # The sign jobs only exist when their service changed. Marking them optional
  # keeps the pipeline valid when some (or all) of them are absent.
  needs:
    - job: sign:core
      optional: true
    - job: sign:webos
      optional: true
    - job: sign:media-engine
      optional: true
    - job: sign:agents
      optional: true
  rules:
    - if: '$CI_COMMIT_BRANCH == "main"'

View File

@@ -0,0 +1,91 @@
#!/usr/bin/env bash
# ============================================================
# Velocity-OS — ECR Registry Provisioner + Image Push Script
# Assumes: aws cli v2, docker, cosign installed on build host
# Run from the Velocity-OS repo root in CI or locally.
# ============================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────
# Abort straight away when the account id is missing; the region has a default.
: "${AWS_ACCOUNT_ID:?Must set AWS_ACCOUNT_ID}"
AWS_REGION="${AWS_REGION:-ap-south-1}"
ECR_REGISTRY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
REGISTRY_PREFIX="velocity-os"

# Image tag comes from git: a tag pointing at HEAD when there is one,
# otherwise the short commit SHA.
GIT_SHA="$(git rev-parse --short HEAD)"
GIT_TAG="$(git tag --points-at HEAD | head -n1 || true)"
IMAGE_TAG="${GIT_TAG:-$GIT_SHA}"

SERVICES=(
  "core"
  "webos"
  "media-engine"
  "agents"
)
# ── Step 1: Provision ECR repositories (idempotent) ──────────
echo "=== Provisioning ECR repositories ==="
# The `latest` tag is re-pushed on every build (below and in CI) and is the
# tag the ingress box polls. An IMMUTABLE repository rejects the second push
# of an existing tag, so repositories default to MUTABLE. Set
# ECR_TAG_MUTABILITY=IMMUTABLE only if `latest` is no longer pushed.
ECR_TAG_MUTABILITY="${ECR_TAG_MUTABILITY:-MUTABLE}"
for svc in "${SERVICES[@]}"; do
  REPO_NAME="${REGISTRY_PREFIX}/${svc}"
  echo " Ensuring repo: ${REPO_NAME}"
  # Repository already there: nothing to do. Any describe failure falls
  # through to create, which then reports the real error and stops the
  # script under `set -e`.
  if aws ecr describe-repositories \
      --repository-names "${REPO_NAME}" \
      --region "${AWS_REGION}" \
      --no-cli-pager \
      > /dev/null 2>&1; then
    continue
  fi
  aws ecr create-repository \
    --repository-name "${REPO_NAME}" \
    --region "${AWS_REGION}" \
    --image-scanning-configuration scanOnPush=true \
    --image-tag-mutability "${ECR_TAG_MUTABILITY}" \
    --encryption-configuration encryptionType=AES256 \
    --no-cli-pager
done
# ── Step 2: ECR Login ─────────────────────────────────────────
echo "=== Authenticating to ECR ==="
aws ecr get-login-password --region "${AWS_REGION}" | \
  docker login --username AWS --password-stdin "${ECR_REGISTRY}"

# ── Step 3: Build + Push + Sign each image ───────────────────
echo "=== Building, pushing, and signing images ==="
for svc in "${SERVICES[@]}"; do
  REMOTE_REPO="${ECR_REGISTRY}/${REGISTRY_PREFIX}/${svc}"
  LOCAL_IMAGE="velocity-os/${svc}:${IMAGE_TAG}"
  REMOTE_IMAGE="${REMOTE_REPO}:${IMAGE_TAG}"
  REMOTE_LATEST="${REMOTE_REPO}:latest"
  echo ""
  echo "--- Service: ${svc} ---"

  # Build
  echo " Building ${LOCAL_IMAGE}..."
  docker build \
    --cache-from "${REMOTE_LATEST}" \
    --build-arg BUILDKIT_INLINE_CACHE=1 \
    --label "git.sha=${GIT_SHA}" \
    --label "git.tag=${GIT_TAG}" \
    --label "build.date=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    -t "${LOCAL_IMAGE}" \
    -t "${REMOTE_IMAGE}" \
    -t "${REMOTE_LATEST}" \
    "./${svc}"

  # Push the versioned tag first, then latest
  echo " Pushing ${REMOTE_IMAGE}..."
  docker push "${REMOTE_IMAGE}"
  docker push "${REMOTE_LATEST}"

  # Sign with cosign (Sigstore keyless or KMS key)
  echo " Signing ${REMOTE_IMAGE} with cosign..."
  # Ask the local daemon first; fall back to ECR when it has no repo digest.
  if ! IMAGE_DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' "${REMOTE_IMAGE}" 2>/dev/null); then
    IMAGE_DIGEST=$(aws ecr describe-images \
      --repository-name "${REGISTRY_PREFIX}/${svc}" \
      --image-ids imageTag="${IMAGE_TAG}" \
      --region "${AWS_REGION}" \
      --query 'imageDetails[0].imageDigest' \
      --output text)
  fi
  # `docker inspect` reports RepoDigests as "<repo>@sha256:…". Keep only the
  # digest, otherwise the reference below becomes "<repo>@<repo>@sha256:…".
  IMAGE_DIGEST="${IMAGE_DIGEST##*@}"
  if [[ "${IMAGE_DIGEST}" != sha256:* ]]; then
    echo "ERROR: could not resolve digest for ${REMOTE_IMAGE} (got '${IMAGE_DIGEST}')" >&2
    exit 1
  fi
  cosign sign --yes "${REMOTE_REPO}@${IMAGE_DIGEST}"
  echo "${svc} pushed and signed: ${REMOTE_IMAGE}"
done

echo ""
echo "=== All images built, pushed, and signed. ==="
echo "ECR Registry: ${ECR_REGISTRY}"
echo "Image tag: ${IMAGE_TAG}"

View File

@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# ============================================================
# Velocity-OS — Ingress Box: Air-Gap Transfer Agent
# Runs on a LAN-connected node (Raspberry Pi / VM).
# Polls ECR every 5 minutes for new signed images.
# Verifies cosign signature. Transfers to air-gapped workstation.
# Triggers K3s rolling restart on new image.
#
# Install as systemd service:
# sudo cp poll_and_transfer.service /etc/systemd/system/
# sudo systemctl enable --now poll_and_transfer
# ============================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────
# The account id is mandatory; everything else has a default.
: "${AWS_ACCOUNT_ID:?Must set AWS_ACCOUNT_ID}"
AWS_REGION="${AWS_REGION:-ap-south-1}"
ECR_REGISTRY="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
REGISTRY_PREFIX="velocity-os"
SERVICES=(
  "core"
  "webos"
  "media-engine"
  "agents"
)

# Air-gapped workstation (LAN only — no internet)
WORKSTATION_IP="${WORKSTATION_IP:-192.168.1.100}"
WORKSTATION_USER="${WORKSTATION_USER:-ubuntu}"
WORKSTATION_SSH_KEY="${WORKSTATION_SSH_KEY:-/home/ingress/.ssh/velocity_workstation_ed25519}"

# STATE_DIR holds one "<service>.last_digest" file per service;
# TRANSFER_DIR is scratch space for image tarballs.
STATE_DIR="/var/lib/velocity-ingress"
TRANSFER_DIR="/tmp/velocity-transfer"
mkdir -p "${STATE_DIR}" "${TRANSFER_DIR}"
# ── Functions ─────────────────────────────────────────────────
# Print the arguments as one line prefixed with a UTC ISO-8601 timestamp.
log() {
  printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}
# Usage: get_latest_digest <service>
# Prints the digest that the `latest` tag of the service's ECR repository
# points to. Prints "NONE" when the lookup command fails; its stderr is
# discarded.
get_latest_digest() {
  local service_name="$1"
  aws ecr describe-images \
    --repository-name "${REGISTRY_PREFIX}/${service_name}" \
    --image-ids imageTag=latest \
    --region "${AWS_REGION}" \
    --query 'imageDetails[0].imageDigest' \
    --output text 2>/dev/null \
    || echo "NONE"
}
# Usage: transfer_image <service> <digest>
# Pulls the image at <digest>, verifies its cosign signature, ships it to the
# workstation, imports it into K3s and restarts the matching deployment.
# Returns 0 only when every step succeeded. The digest is written to the
# state file last, so a failed run is retried on the next poll.
#
# Optional environment:
#   COSIGN_CERT_IDENTITY_REGEXP     expected signer identity    (default ".*")
#   COSIGN_CERT_OIDC_ISSUER_REGEXP  expected OIDC issuer        (default ".*")
transfer_image() {
  local svc="$1"
  local digest="$2"
  local repo="${ECR_REGISTRY}/${REGISTRY_PREFIX}/${svc}"
  local full_image="${repo}@${digest}"
  local latest_ref="${repo}:latest"
  local tar_file="${TRANSFER_DIR}/${svc}.tar"
  local remote_tar="/tmp/${svc}.tar"
  local remote="${WORKSTATION_USER}@${WORKSTATION_IP}"
  local identity_regexp="${COSIGN_CERT_IDENTITY_REGEXP:-.*}"
  local issuer_regexp="${COSIGN_CERT_OIDC_ISSUER_REGEXP:-.*}"

  # The "core" image is run by the deployment named "core-api" in the K3s
  # manifests; the other services use their own name.
  local deployment="${svc}"
  if [[ "${svc}" == "core" ]]; then
    deployment="core-api"
  fi

  # This function is called as `transfer_image … || log …`, which switches
  # `set -e` off for its whole body. Every step therefore checks its own
  # exit status; otherwise a failed step would be ignored and the digest
  # recorded as transferred.

  log " [${svc}] Pulling from ECR..."
  # Pull by digest so the image that gets verified is the image that ships,
  # even if `latest` moves in the meantime.
  docker pull "${full_image}" || {
    log " [${svc}] ERROR: docker pull failed."
    return 1
  }

  log " [${svc}] Verifying cosign signature..."
  if [[ "${identity_regexp}" == ".*" || "${issuer_regexp}" == ".*" ]]; then
    log " [${svc}] WARNING: accepting a signature from ANY identity/issuer; set COSIGN_CERT_IDENTITY_REGEXP and COSIGN_CERT_OIDC_ISSUER_REGEXP to pin the signer."
  fi
  cosign verify \
    --certificate-identity-regexp "${identity_regexp}" \
    --certificate-oidc-issuer-regexp "${issuer_regexp}" \
    "${full_image}" || {
    log " [${svc}] ERROR: Signature verification FAILED. Refusing transfer."
    return 1
  }

  log " [${svc}] Saving image to tarball..."
  # Re-point the local `latest` tag at the verified digest so the tarball
  # carries the name the manifests reference.
  docker tag "${full_image}" "${latest_ref}" || {
    log " [${svc}] ERROR: docker tag failed."
    return 1
  }
  docker save "${latest_ref}" -o "${tar_file}" || {
    log " [${svc}] ERROR: docker save failed."
    rm -f "${tar_file}"
    return 1
  }

  log " [${svc}] Transferring to workstation via SCP..."
  scp -i "${WORKSTATION_SSH_KEY}" \
    -o StrictHostKeyChecking=yes \
    "${tar_file}" \
    "${remote}:${remote_tar}" || {
    log " [${svc}] ERROR: scp to workstation failed."
    rm -f "${tar_file}"
    return 1
  }
  rm -f "${tar_file}"

  log " [${svc}] Importing into K3s containerd + rolling restart..."
  # The restart is skipped when the `kubectl get` check for the deployment
  # fails, so an image without a workload is still recorded and not
  # re-transferred on every poll.
  ssh -i "${WORKSTATION_SSH_KEY}" \
    -o StrictHostKeyChecking=yes \
    "${remote}" \
    "set -e
     sudo k3s ctr images import ${remote_tar}
     rm -f ${remote_tar}
     if sudo kubectl get deployment/${deployment} -n velocity-os >/dev/null 2>&1; then
       sudo kubectl rollout restart deployment/${deployment} -n velocity-os
     else
       echo 'deployment/${deployment} not found in velocity-os; restart skipped'
     fi" || {
    log " [${svc}] ERROR: import/restart on workstation failed."
    return 1
  }

  # Record transferred digest
  echo "${digest}" > "${STATE_DIR}/${svc}.last_digest" || {
    log " [${svc}] ERROR: could not write state file."
    return 1
  }
  log " [${svc}] ✓ Transfer complete. Digest: ${digest}"
}
# ── Main poll loop ────────────────────────────────────────────
log "=== Velocity-OS Ingress Box polling ECR ==="

# Log in to ECR on every run, so each poll cycle starts with a fresh token.
aws ecr get-login-password --region "${AWS_REGION}" \
  | docker login --username AWS --password-stdin "${ECR_REGISTRY}"

# Compare the digest behind `latest` with the one recorded for the last
# successful transfer and ship the image only when they differ.
for svc in "${SERVICES[@]}"; do
  log "[${svc}] Checking for updates..."
  CURRENT_DIGEST="$(get_latest_digest "${svc}")"
  LAST_DIGEST="$(cat "${STATE_DIR}/${svc}.last_digest" 2>/dev/null || echo "NONE")"

  if [[ "${CURRENT_DIGEST}" == "NONE" ]]; then
    log " [${svc}] No image found in ECR. Skipping."
  elif [[ "${CURRENT_DIGEST}" == "${LAST_DIGEST}" ]]; then
    log " [${svc}] Up to date. No transfer needed."
  else
    log " [${svc}] New digest detected: ${CURRENT_DIGEST}"
    # A failed transfer is only logged; the loop moves on to the next service.
    if ! transfer_image "${svc}" "${CURRENT_DIGEST}"; then
      log " [${svc}] Transfer FAILED. Will retry next cycle."
    fi
  fi
done

log "=== Poll cycle complete ==="

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import subprocess
import sys
from pathlib import Path
import boto3
def load_env_file(path: Path) -> dict[str, str]:
    """Read a KEY=VALUE file into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are stripped of surrounding whitespace, and
    only the first ``=`` separates key from value. A later duplicate key
    overrides an earlier one. A missing file yields an empty dict.
    """
    if not path.exists():
        return {}

    parsed: dict[str, str] = {}
    for raw in path.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        if entry == "" or entry.startswith("#"):
            continue
        name, separator, remainder = entry.partition("=")
        if not separator:
            continue
        parsed[name.strip()] = remainder.strip()
    return parsed
def env(name: str, default: str = "") -> str:
    """Return the environment variable ``name``, or ``default`` when it is unset."""
    try:
        return os.environ[name]
    except KeyError:
        return default
def resolve_target_instance(ec2) -> dict | None:
    """Pick the EC2 instance the ComfyUI route should point at.

    When ``COMFY_INSTANCE_ID`` is set, only that instance is considered and
    it is returned only if it is running. Otherwise running instances are
    looked up by tag (``COMFY_INSTANCE_TAG_KEY`` / ``COMFY_INSTANCE_TAG_VALUE``)
    and the most recently launched one wins.

    ``ec2`` must offer ``describe_instances`` as used below. Returns the
    instance dict from that response, or ``None`` when nothing matches.
    """
    pinned_id = env("COMFY_INSTANCE_ID")
    if pinned_id:
        response = ec2.describe_instances(InstanceIds=[pinned_id])
        running = (
            candidate
            for group in response["Reservations"]
            for candidate in group["Instances"]
            if candidate["State"]["Name"] == "running"
        )
        return next(running, None)

    tag_key = env("COMFY_INSTANCE_TAG_KEY", "DesineuronRole")
    tag_value = env("COMFY_INSTANCE_TAG_VALUE", "comfyui")
    response = ec2.describe_instances(
        Filters=[
            {"Name": "instance-state-name", "Values": ["running"]},
            {"Name": f"tag:{tag_key}", "Values": [tag_value]},
        ]
    )
    candidates = [
        candidate
        for group in response["Reservations"]
        for candidate in group["Instances"]
    ]
    if not candidates:
        return None
    # Newest launch wins; on a tie the first one listed is kept.
    return max(candidates, key=lambda row: row["LaunchTime"])
def upsert_route(hostname: str, private_ip: str, port: int) -> subprocess.CompletedProcess[str]:
    """Upsert the reverse-proxy route on the ingress host over SSH.

    Runs the route helper with a JSON payload, then validates the Caddyfile
    and reloads caddy; the steps are chained with ``&&`` so a failure stops
    the rest. Connection settings come from the ``INGRESS_SSH_*`` variables
    and the helper path from ``INGRESS_ROUTE_HELPER``.

    Args:
        hostname: Public hostname of the route.
        private_ip: Target host the route forwards to.
        port: Target port the route forwards to.

    Returns:
        The completed ``ssh`` process with captured text stdout/stderr.
        A non-zero exit does not raise; callers check ``returncode``.
    """
    import shlex  # function-scope import: the file header does not import it

    ingress_host = env("INGRESS_SSH_HOST")
    ingress_user = env("INGRESS_SSH_USER", "ec2-user")
    ingress_port = env("INGRESS_SSH_PORT", "22")
    ingress_key = env("INGRESS_SSH_KEY_PATH")
    helper = env("INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py")
    payload = json.dumps(
        {
            "hostname": hostname,
            "scheme": "http",
            "target_host": private_ip,
            "target_port": port,
        }
    )
    # The command string is interpreted by the remote shell. Quote the payload
    # properly: wrapping it in bare single quotes breaks (and allows command
    # injection) as soon as a value contains a single quote.
    command = (
        f"sudo {helper} upsert {shlex.quote(payload)}"
        " && sudo caddy validate --config /etc/caddy/Caddyfile"
        " && sudo systemctl reload caddy"
    )
    # NOTE(review): host-key checking is disabled below, so the ingress host
    # is not authenticated. Consider pinning its key in a known_hosts file.
    return subprocess.run(
        [
            "ssh",
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "UserKnownHostsFile=/dev/null",
            "-i",
            ingress_key,
            "-p",
            ingress_port,
            f"{ingress_user}@{ingress_host}",
            command,
        ],
        capture_output=True,
        text=True,
        check=False,
    )
def main() -> int:
    """Point the ComfyUI route at the current target instance.

    Fills in missing AWS and ingress settings from the ops env file, resolves
    the target instance, and upserts the route only when the target differs
    from the one recorded in the state file.

    Returns:
        0 on success or no-op, 1 when no usable instance is found, otherwise
        the exit code of the failed remote command.
    """
    ops_env = load_env_file(Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env")))
    # Real environment variables win over values from the ops env file.
    for key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"]:
        if key not in os.environ and key in ops_env:
            os.environ[key] = ops_env[key]
    os.environ.setdefault("AWS_DEFAULT_REGION", ops_env.get("OPS_DEFAULT_REGION", "us-east-1"))
    os.environ.setdefault("INGRESS_SSH_HOST", ops_env.get("OPS_INGRESS_SSH_HOST", ""))
    os.environ.setdefault("INGRESS_SSH_USER", ops_env.get("OPS_INGRESS_SSH_USER", "ec2-user"))
    os.environ.setdefault("INGRESS_SSH_PORT", ops_env.get("OPS_INGRESS_SSH_PORT", "22"))
    normalized_key_path = ops_env.get("OPS_SSH_KEY_PATH", "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem")
    # Rewrite a key path under /app/state/ to the ops control plane state dir.
    if normalized_key_path.startswith("/app/state/"):
        normalized_key_path = normalized_key_path.replace("/app/state/", "/opt/desineuron-ops-control-plane/state/")
    os.environ.setdefault("INGRESS_SSH_KEY_PATH", normalized_key_path)
    os.environ.setdefault("INGRESS_ROUTE_HELPER", ops_env.get("OPS_INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py"))

    region = os.environ["AWS_DEFAULT_REGION"]
    hostname = env("COMFY_ROUTE_HOSTNAME", "comfy.desineuron.in")
    port = int(env("COMFY_ROUTE_PORT", "8188"))
    state_file = Path(env("COMFY_ROUTE_STATE_FILE", "/var/lib/desineuron-comfy-route-sync/current_target.txt"))

    ec2 = boto3.client("ec2", region_name=region)
    instance = resolve_target_instance(ec2)
    if not instance:
        print("No running comfyui target instance found", file=sys.stderr)
        return 1
    private_ip = instance.get("PrivateIpAddress")
    if not private_ip:
        print("Target instance has no private IP", file=sys.stderr)
        return 1

    # Track "ip:port", not just the IP: otherwise changing COMFY_ROUTE_PORT
    # while the instance stays the same is reported as a no-op and the route
    # keeps the old port. A state file written in the old IP-only format just
    # causes one extra upsert.
    desired_state = f"{private_ip}:{port}"
    current = state_file.read_text(encoding="utf-8").strip() if state_file.exists() else ""
    if current == desired_state:
        print(
            json.dumps(
                {"status": "noop", "hostname": hostname, "target_host": private_ip, "target_port": port}
            )
        )
        return 0

    result = upsert_route(hostname, private_ip, port)
    if result.returncode != 0:
        print(result.stdout)
        print(result.stderr, file=sys.stderr)
        return result.returncode

    # Record the target only after the route was actually updated.
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(desired_state, encoding="utf-8")
    print(
        json.dumps(
            {"status": "updated", "hostname": hostname, "target_host": private_ip, "target_port": port}
        )
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,152 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import subprocess
import sys
from pathlib import Path
import boto3
def load_env_file(path: Path) -> dict[str, str]:
    """Read a KEY=VALUE file into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are stripped of surrounding whitespace, and
    only the first ``=`` separates key from value. A later duplicate key
    overrides an earlier one. A missing file yields an empty dict.
    """
    if not path.exists():
        return {}

    parsed: dict[str, str] = {}
    for raw in path.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        if entry == "" or entry.startswith("#"):
            continue
        name, separator, remainder = entry.partition("=")
        if not separator:
            continue
        parsed[name.strip()] = remainder.strip()
    return parsed
def env(name: str, default: str = "") -> str:
    """Return the environment variable ``name``, or ``default`` when it is unset."""
    try:
        return os.environ[name]
    except KeyError:
        return default
def resolve_target_instance(ec2) -> dict | None:
    """Pick the EC2 instance the LLM route should point at.

    When ``LLM_INSTANCE_ID`` is set, only that instance is considered and it
    is returned only if it is running. Otherwise running instances are looked
    up by tag (``LLM_INSTANCE_TAG_KEY`` / ``LLM_INSTANCE_TAG_VALUE``) and the
    most recently launched one wins.

    ``ec2`` must offer ``describe_instances`` as used below. Returns the
    instance dict from that response, or ``None`` when nothing matches.
    """
    pinned_id = env("LLM_INSTANCE_ID")
    if pinned_id:
        response = ec2.describe_instances(InstanceIds=[pinned_id])
        running = (
            candidate
            for group in response["Reservations"]
            for candidate in group["Instances"]
            if candidate["State"]["Name"] == "running"
        )
        return next(running, None)

    # The tag value defaults to "comfyui": the LLM runtime is assumed to run
    # on the same GPU instance as comfyui initially.
    tag_key = env("LLM_INSTANCE_TAG_KEY", "DesineuronRole")
    tag_value = env("LLM_INSTANCE_TAG_VALUE", "comfyui")
    response = ec2.describe_instances(
        Filters=[
            {"Name": "instance-state-name", "Values": ["running"]},
            {"Name": f"tag:{tag_key}", "Values": [tag_value]},
        ]
    )
    candidates = [
        candidate
        for group in response["Reservations"]
        for candidate in group["Instances"]
    ]
    if not candidates:
        return None
    # Newest launch wins; on a tie the first one listed is kept.
    return max(candidates, key=lambda row: row["LaunchTime"])
def upsert_route(hostname: str, private_ip: str, port: int) -> subprocess.CompletedProcess[str]:
    """Upsert the reverse-proxy route on the ingress host over SSH.

    Runs the route helper with a JSON payload, then validates the Caddyfile
    and reloads caddy; the steps are chained with ``&&`` so a failure stops
    the rest. Connection settings come from the ``INGRESS_SSH_*`` variables
    and the helper path from ``INGRESS_ROUTE_HELPER``.

    Args:
        hostname: Public hostname of the route.
        private_ip: Target host the route forwards to.
        port: Target port the route forwards to.

    Returns:
        The completed ``ssh`` process with captured text stdout/stderr.
        A non-zero exit does not raise; callers check ``returncode``.
    """
    import shlex  # function-scope import: the file header does not import it

    ingress_host = env("INGRESS_SSH_HOST")
    ingress_user = env("INGRESS_SSH_USER", "ec2-user")
    ingress_port = env("INGRESS_SSH_PORT", "22")
    ingress_key = env("INGRESS_SSH_KEY_PATH")
    helper = env("INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py")
    payload = json.dumps(
        {
            "hostname": hostname,
            "scheme": "http",
            "target_host": private_ip,
            "target_port": port,
        }
    )
    # The command string is interpreted by the remote shell. Quote the payload
    # properly: wrapping it in bare single quotes breaks (and allows command
    # injection) as soon as a value contains a single quote.
    command = (
        f"sudo {helper} upsert {shlex.quote(payload)}"
        " && sudo caddy validate --config /etc/caddy/Caddyfile"
        " && sudo systemctl reload caddy"
    )
    # NOTE(review): host-key checking is disabled below, so the ingress host
    # is not authenticated. Consider pinning its key in a known_hosts file.
    return subprocess.run(
        [
            "ssh",
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "UserKnownHostsFile=/dev/null",
            "-i",
            ingress_key,
            "-p",
            ingress_port,
            f"{ingress_user}@{ingress_host}",
            command,
        ],
        capture_output=True,
        text=True,
        check=False,
    )
def main() -> int:
    """Point the LLM route at the current target instance.

    Fills in missing AWS and ingress settings from the ops env file, resolves
    the target instance, and upserts the route only when ``ip:port`` differs
    from what the state file records.

    Returns:
        0 on success or no-op, 1 when no usable instance is found, otherwise
        the exit code of the failed remote command.
    """
    ops_env = load_env_file(Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env")))

    # Copy AWS settings from the ops env file, never overriding the real environment.
    for aws_key in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"):
        if aws_key in ops_env and aws_key not in os.environ:
            os.environ[aws_key] = ops_env[aws_key]

    # Rewrite a key path under /app/state/ to the ops control plane state dir.
    key_path = ops_env.get("OPS_SSH_KEY_PATH", "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem")
    if key_path.startswith("/app/state/"):
        key_path = key_path.replace("/app/state/", "/opt/desineuron-ops-control-plane/state/")

    # Fallbacks, applied only to variables that are still unset.
    fallbacks = {
        "AWS_DEFAULT_REGION": ops_env.get("OPS_DEFAULT_REGION", "us-east-1"),
        "INGRESS_SSH_HOST": ops_env.get("OPS_INGRESS_SSH_HOST", ""),
        "INGRESS_SSH_USER": ops_env.get("OPS_INGRESS_SSH_USER", "ec2-user"),
        "INGRESS_SSH_PORT": ops_env.get("OPS_INGRESS_SSH_PORT", "22"),
        "INGRESS_SSH_KEY_PATH": key_path,
        "INGRESS_ROUTE_HELPER": ops_env.get("OPS_INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py"),
    }
    for variable, fallback in fallbacks.items():
        os.environ.setdefault(variable, fallback)

    region = os.environ["AWS_DEFAULT_REGION"]
    hostname = env("LLM_ROUTE_HOSTNAME", "llm.desineuron.in")
    port = int(env("LLM_ROUTE_PORT", "11434"))
    state_file = Path(env("LLM_ROUTE_STATE_FILE", "/var/lib/desineuron-llm-route-sync/current_target.txt"))

    instance = resolve_target_instance(boto3.client("ec2", region_name=region))
    if not instance:
        print("No running LLM target instance found", file=sys.stderr)
        return 1

    private_ip = instance.get("PrivateIpAddress")
    if not private_ip:
        print("Target instance has no private IP", file=sys.stderr)
        return 1

    desired_state = f"{private_ip}:{port}"
    recorded_state = ""
    if state_file.exists():
        recorded_state = state_file.read_text(encoding="utf-8").strip()

    report = {"status": "noop", "hostname": hostname, "target_host": private_ip, "target_port": port}
    if recorded_state == desired_state:
        print(json.dumps(report))
        return 0

    outcome = upsert_route(hostname, private_ip, port)
    if outcome.returncode != 0:
        print(outcome.stdout)
        print(outcome.stderr, file=sys.stderr)
        return outcome.returncode

    # Record the target only after the route was actually updated.
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(desired_state, encoding="utf-8")
    report["status"] = "updated"
    print(json.dumps(report))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import subprocess
import sys
from pathlib import Path
import boto3
def load_env_file(path: Path) -> dict[str, str]:
    """Read a KEY=VALUE file into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are stripped of surrounding whitespace, and
    only the first ``=`` separates key from value. A later duplicate key
    overrides an earlier one. A missing file yields an empty dict.
    """
    if not path.exists():
        return {}

    parsed: dict[str, str] = {}
    for raw in path.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        if entry == "" or entry.startswith("#"):
            continue
        name, separator, remainder = entry.partition("=")
        if not separator:
            continue
        parsed[name.strip()] = remainder.strip()
    return parsed
def env(name: str, default: str = "") -> str:
    """Return the environment variable ``name``, or ``default`` when it is unset."""
    try:
        return os.environ[name]
    except KeyError:
        return default
def resolve_target_instance(ec2) -> dict | None:
    """Pick the EC2 instance the Velocity backend route should point at.

    When ``VELOCITY_INSTANCE_ID`` is set, only that instance is considered
    and it is returned only if it is running. Otherwise running instances are
    looked up by tag (``VELOCITY_INSTANCE_TAG_KEY`` /
    ``VELOCITY_INSTANCE_TAG_VALUE``) and the most recently launched one wins.

    ``ec2`` must offer ``describe_instances`` as used below. Returns the
    instance dict from that response, or ``None`` when nothing matches.
    """
    pinned_id = env("VELOCITY_INSTANCE_ID")
    if pinned_id:
        response = ec2.describe_instances(InstanceIds=[pinned_id])
        running = (
            candidate
            for group in response["Reservations"]
            for candidate in group["Instances"]
            if candidate["State"]["Name"] == "running"
        )
        return next(running, None)

    tag_key = env("VELOCITY_INSTANCE_TAG_KEY", "DesineuronRole")
    tag_value = env("VELOCITY_INSTANCE_TAG_VALUE", "velocity-backend")
    response = ec2.describe_instances(
        Filters=[
            {"Name": "instance-state-name", "Values": ["running"]},
            {"Name": f"tag:{tag_key}", "Values": [tag_value]},
        ]
    )
    candidates = [
        candidate
        for group in response["Reservations"]
        for candidate in group["Instances"]
    ]
    if not candidates:
        return None
    # Newest launch wins; on a tie the first one listed is kept.
    return max(candidates, key=lambda row: row["LaunchTime"])
def upsert_route(hostname: str, private_ip: str, port: int) -> subprocess.CompletedProcess[str]:
    """Upsert the reverse-proxy route on the ingress host over SSH.

    Runs the route helper with a JSON payload, then validates the Caddyfile
    and reloads caddy; the steps are chained with ``&&`` so a failure stops
    the rest. Connection settings come from the ``INGRESS_SSH_*`` variables
    and the helper path from ``INGRESS_ROUTE_HELPER``.

    Args:
        hostname: Public hostname of the route.
        private_ip: Target host the route forwards to.
        port: Target port the route forwards to.

    Returns:
        The completed ``ssh`` process with captured text stdout/stderr.
        A non-zero exit does not raise; callers check ``returncode``.
    """
    import shlex  # function-scope import: the file header does not import it

    ingress_host = env("INGRESS_SSH_HOST")
    ingress_user = env("INGRESS_SSH_USER", "ec2-user")
    ingress_port = env("INGRESS_SSH_PORT", "22")
    ingress_key = env("INGRESS_SSH_KEY_PATH")
    helper = env("INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py")
    payload = json.dumps(
        {
            "hostname": hostname,
            "scheme": "http",
            "target_host": private_ip,
            "target_port": port,
        }
    )
    # The command string is interpreted by the remote shell. Quote the payload
    # properly: wrapping it in bare single quotes breaks (and allows command
    # injection) as soon as a value contains a single quote.
    command = (
        f"sudo {helper} upsert {shlex.quote(payload)}"
        " && sudo caddy validate --config /etc/caddy/Caddyfile"
        " && sudo systemctl reload caddy"
    )
    # NOTE(review): host-key checking is disabled below, so the ingress host
    # is not authenticated. Consider pinning its key in a known_hosts file.
    return subprocess.run(
        [
            "ssh",
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "UserKnownHostsFile=/dev/null",
            "-i",
            ingress_key,
            "-p",
            ingress_port,
            f"{ingress_user}@{ingress_host}",
            command,
        ],
        capture_output=True,
        text=True,
        check=False,
    )
def main() -> int:
    """Point the Velocity backend route at the current target instance.

    Fills in missing AWS and ingress settings from the ops env file, resolves
    the target instance, and upserts the route only when the target differs
    from the one recorded in the state file.

    Returns:
        0 on success or no-op, 1 when no usable instance is found, otherwise
        the exit code of the failed remote command.
    """
    ops_env = load_env_file(Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env")))
    # Real environment variables win over values from the ops env file.
    for key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"]:
        if key not in os.environ and key in ops_env:
            os.environ[key] = ops_env[key]
    os.environ.setdefault("AWS_DEFAULT_REGION", ops_env.get("OPS_DEFAULT_REGION", "us-east-1"))
    os.environ.setdefault("INGRESS_SSH_HOST", ops_env.get("OPS_INGRESS_SSH_HOST", ""))
    os.environ.setdefault("INGRESS_SSH_USER", ops_env.get("OPS_INGRESS_SSH_USER", "ec2-user"))
    os.environ.setdefault("INGRESS_SSH_PORT", ops_env.get("OPS_INGRESS_SSH_PORT", "22"))
    normalized_key_path = ops_env.get("OPS_SSH_KEY_PATH", "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem")
    # Rewrite a key path under /app/state/ to the ops control plane state dir.
    if normalized_key_path.startswith("/app/state/"):
        normalized_key_path = normalized_key_path.replace("/app/state/", "/opt/desineuron-ops-control-plane/state/")
    os.environ.setdefault("INGRESS_SSH_KEY_PATH", normalized_key_path)
    os.environ.setdefault("INGRESS_ROUTE_HELPER", ops_env.get("OPS_INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py"))

    region = os.environ["AWS_DEFAULT_REGION"]
    hostname = env("VELOCITY_ROUTE_HOSTNAME", "api.desineuron.in")
    port = int(env("VELOCITY_ROUTE_PORT", "8001"))
    state_file = Path(env("VELOCITY_ROUTE_STATE_FILE", "/var/lib/desineuron-velocity-route-sync/current_target.txt"))

    ec2 = boto3.client("ec2", region_name=region)
    instance = resolve_target_instance(ec2)
    if not instance:
        print("No running velocity-backend target instance found", file=sys.stderr)
        return 1
    private_ip = instance.get("PrivateIpAddress")
    if not private_ip:
        print("Target instance has no private IP", file=sys.stderr)
        return 1

    # Track "ip:port", not just the IP: otherwise changing VELOCITY_ROUTE_PORT
    # while the instance stays the same is reported as a no-op and the route
    # keeps the old port. A state file written in the old IP-only format just
    # causes one extra upsert.
    desired_state = f"{private_ip}:{port}"
    current = state_file.read_text(encoding="utf-8").strip() if state_file.exists() else ""
    if current == desired_state:
        print(
            json.dumps(
                {"status": "noop", "hostname": hostname, "target_host": private_ip, "target_port": port}
            )
        )
        return 0

    result = upsert_route(hostname, private_ip, port)
    if result.returncode != 0:
        print(result.stdout)
        print(result.stderr, file=sys.stderr)
        return result.returncode

    # Record the target only after the route was actually updated.
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(desired_state, encoding="utf-8")
    print(
        json.dumps(
            {"status": "updated", "hostname": hostname, "target_host": private_ip, "target_port": port}
        )
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,342 @@
# ============================================================
# Velocity-OS — K3s Deployments
# All services in velocity-os namespace.
# GPU: RTX 6000 Blackwell 96GB VRAM — MIG partitioned.
# MIG slice 0 (48GB): SGLang LLM inference (core-api)
# MIG slice 1 (48GB): ComfyUI media generation (media-engine)
# ============================================================
---
# ── PostgreSQL (StatefulSet for stable identity) ─────────────
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
  namespace: velocity-os
  labels:
    app: postgres
    tier: database
spec:
  serviceName: postgres
  replicas: 1
  selector:
    matchLabels:
      app: postgres
  template:
    metadata:
      labels:
        app: postgres
        tier: database
    spec:
      containers:
        - name: postgres
          image: ${ECR_REGISTRY}/postgres:15-alpine
          ports:
            - containerPort: 5432
          # Database name and credentials all come from the velocity-secrets
          # Secret.
          env:
            - name: POSTGRES_DB
              valueFrom:
                secretKeyRef:
                  name: velocity-secrets
                  key: POSTGRES_DB
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: velocity-secrets
                  key: POSTGRES_USER
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: velocity-secrets
                  key: POSTGRES_PASSWORD
            # Keep the cluster in a sub-directory of the mounted volume.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          volumeMounts:
            - name: postgres-data
              mountPath: /var/lib/postgresql/data
          # The probes read the user and database from the container
          # environment, so they stay in step with the Secret. A hard-coded
          # user makes the server log a failed connection attempt on every
          # probe whenever the Secret names a different one.
          livenessProbe:
            exec:
              command:
                - sh
                - "-c"
                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - sh
                - "-c"
                - pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: postgres-data
          persistentVolumeClaim:
            claimName: pvc-postgres-data
---
# ── Redis (session cache, future queue) ──────────────────────
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: velocity-os
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: ${ECR_REGISTRY}/redis:7-alpine
          # Cap the dataset at 400mb and evict least-recently-used keys.
          args:
            - "--maxmemory"
            - "400mb"
            - "--maxmemory-policy"
            - "allkeys-lru"
          ports:
            - containerPort: 6379
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "250m"
              memory: "512Mi"
---
# ── Core API (FastAPI) ────────────────────────────────────────
apiVersion: apps/v1
kind: Deployment
metadata:
  name: core-api
  namespace: velocity-os
  labels:
    app: core-api
    tier: backend
spec:
  # Each pod claims one whole MIG slice. The GPU is split into two slices and
  # the other one belongs to media-engine, so only one core-api pod can ever
  # be scheduled; a second replica would stay Pending.
  replicas: 1
  selector:
    matchLabels:
      app: core-api
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # No spare MIG slice exists for a surge pod, so the old pod has to go
      # before the new one can start.
      maxUnavailable: 1
      maxSurge: 0
  template:
    metadata:
      labels:
        app: core-api
        tier: backend
    spec:
      # MIG slice 0: SGLang LLM inference
      # The core-api pod requests the MIG slice via its resource limit
      runtimeClassName: nvidia
      containers:
        - name: core-api
          image: ${ECR_REGISTRY}/velocity-os/core:latest
          # A `:latest` tag defaults to imagePullPolicy Always. The
          # workstation is air-gapped and gets images via `ctr images import`,
          # so use the imported image and never contact the registry.
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8443
          envFrom:
            - secretRef:
                name: velocity-secrets
            - configMapRef:
                name: velocity-config
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
              # RTX 6000 Blackwell MIG 3g.48gb (SGLang slice)
              nvidia.com/mig-3g.48gb: "1"
            limits:
              memory: "2Gi"
              cpu: "1000m"
              nvidia.com/mig-3g.48gb: "1"
          volumeMounts:
            - name: asset-store
              mountPath: /opt/assets
            - name: model-cache
              mountPath: /opt/models
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 8443
            initialDelaySeconds: 20
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /health
              port: 8443
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: asset-store
          persistentVolumeClaim:
            claimName: pvc-asset-store
        - name: model-cache
          persistentVolumeClaim:
            claimName: pvc-model-cache
---
# ── WebOS (Nginx static + React) ─────────────────────────────
apiVersion: apps/v1
kind: Deployment
metadata:
  name: webos
  namespace: velocity-os
  labels:
    app: webos
    tier: frontend
spec:
  replicas: 2
  selector:
    matchLabels:
      app: webos
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 0
      maxSurge: 1
  template:
    metadata:
      labels:
        app: webos
        tier: frontend
    spec:
      containers:
        - name: webos
          image: ${ECR_REGISTRY}/velocity-os/webos:latest
          # A `:latest` tag defaults to imagePullPolicy Always. The
          # workstation is air-gapped and gets images via `ctr images import`,
          # so use the imported image and never contact the registry.
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 80
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "250m"
          livenessProbe:
            httpGet:
              path: /health.txt
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 10
          # With maxUnavailable: 0 the rollout removes an old pod as soon as
          # its replacement counts as ready. Without a readiness probe that is
          # the moment the container starts, before it can serve.
          readinessProbe:
            httpGet:
              path: /health.txt
              port: 80
            initialDelaySeconds: 2
            periodSeconds: 5
---
# ── Media Engine (Dream Weaver Gateway) ──────────────────────
apiVersion: apps/v1
kind: Deployment
metadata:
  name: media-engine
  namespace: velocity-os
  labels:
    app: media-engine
    tier: ai
spec:
  replicas: 1
  selector:
    matchLabels:
      app: media-engine
  # The default rolling update starts the new pod before stopping the old
  # one, which needs a second free MIG slice. Both slices are taken, so the
  # new pod would stay Pending and the rollout would never finish. Stop the
  # old pod first.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: media-engine
        tier: ai
    spec:
      # MIG slice 1: ComfyUI media generation
      runtimeClassName: nvidia
      containers:
        - name: media-engine
          image: ${ECR_REGISTRY}/velocity-os/media-engine:latest
          # A `:latest` tag defaults to imagePullPolicy Always. The
          # workstation is air-gapped and gets images via `ctr images import`,
          # so use the imported image and never contact the registry.
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8290
          envFrom:
            - secretRef:
                name: velocity-secrets
            - configMapRef:
                name: velocity-config
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
              # RTX 6000 Blackwell MIG 3g.48gb (ComfyUI slice)
              nvidia.com/mig-3g.48gb: "1"
            limits:
              memory: "4Gi"
              cpu: "2000m"
              nvidia.com/mig-3g.48gb: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /opt/models
              readOnly: true
            - name: asset-store
              mountPath: /opt/assets
          livenessProbe:
            httpGet:
              path: /health
              port: 8290
            initialDelaySeconds: 30
            periodSeconds: 30
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: pvc-model-cache
        - name: asset-store
          persistentVolumeClaim:
            claimName: pvc-asset-store
---
# ── DB Init Job (runs once: schema apply + seed) ─────────────
apiVersion: batch/v1
kind: Job
metadata:
  name: db-init
  namespace: velocity-os
  labels:
    app: db-init
spec:
  # Never auto-restart; operator re-runs manually if needed
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: db-init
    spec:
      restartPolicy: Never
      initContainers:
        # Wait for postgres to be ready before running init
        - name: wait-for-postgres
          image: ${ECR_REGISTRY}/postgres:15-alpine
          command:
            - sh
            - "-c"
            - "until pg_isready -h postgres -U $(POSTGRES_USER); do echo waiting...; sleep 2; done"
          envFrom:
            - secretRef:
                name: velocity-secrets
      containers:
        - name: db-init
          image: ${ECR_REGISTRY}/velocity-os/core:latest
          # A `:latest` tag defaults to imagePullPolicy Always. The
          # workstation is air-gapped and gets images via `ctr images import`,
          # so use the imported image and never contact the registry.
          imagePullPolicy: IfNotPresent
          command:
            - sh
            - "-c"
            - |
              # Stop at the first failure. Without `set -e` and
              # ON_ERROR_STOP a broken schema file is skipped over, psql
              # still exits 0 and the Job reports success on a
              # half-initialised database.
              set -eu
              echo "=== Applying schemas ==="
              for schema in \
                /app/db/schema.sql \
                /app/db/schema_addendum.sql \
                /app/db/schema_comms.sql \
                /app/db/schema_crm_canonical.sql \
                /app/oracle/schema_oracle.sql \
                /app/oracle/schema_extension_v2.sql
              do
                psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -f "$schema"
              done
              echo "=== Seeding synthetic CRM v2 ==="
              python /app/scripts/seed_synthetic_crm.py
              echo "=== DB init complete ==="
          envFrom:
            - secretRef:
                name: velocity-secrets
            - configMapRef:
                name: velocity-config

View File

@@ -0,0 +1,107 @@
# ============================================================
# Velocity-OS — NVIDIA MIG Configuration for K3s
# Target GPU: NVIDIA RTX 6000 Blackwell (96GB VRAM)
#
# MIG Strategy: Partition 96GB into two equal 48GB slices:
# MIG slice 0 (3g.48gb): SGLang LLM inference (core-api)
# MIG slice 1 (3g.48gb): ComfyUI media generation (media-engine)
#
# Result: Concurrent zero-contention GPU execution.
# No operator toggle required.
#
# Prerequisites on workstation:
# - nvidia-driver >= 550
# - CUDA >= 12.4
# - k3s with nvidia-container-toolkit
# - NVIDIA device plugin with MIG support
# ============================================================
# ── Step 1: Enable MIG mode on the GPU ───────────────────────
# Run on workstation (one-time, survives reboot via service):
# sudo nvidia-smi -i 0 --mig-mode=ENABLE
# sudo reboot
# ── Step 2: Create MIG instances ─────────────────────────────
# Run after reboot:
# sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
# This creates:
# GPU instance 0: 3g.48gb (48GB) → MIG device 0
# GPU instance 1: 3g.48gb (48GB) → MIG device 1
# Verify: nvidia-smi -L
---
# ── K3s: NVIDIA Device Plugin with MIG strategy ──────────────
# Runs the NVIDIA device plugin on each node so that MIG slices can be
# requested by pods as extended resources.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
  labels:
    app: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      app: nvidia-device-plugin
  template:
    metadata:
      labels:
        app: nvidia-device-plugin
    spec:
      # Same RuntimeClass the media-engine pod uses. On K3s the NVIDIA
      # runtime is not the default, so without this the plugin cannot see
      # the GPU and (because FAIL_ON_INIT_ERROR is "false") would idle
      # silently instead of advertising any devices.
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: nvidia-device-plugin
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          env:
            # "mixed" strategy: each MIG profile is advertised as its own
            # resource name (e.g. nvidia.com/mig-3g.48gb).
            - name: MIG_STRATEGY
              value: "mixed"
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            # Kubelet's device-plugin socket directory on the host.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
---
# ── Node label: MIG-capable workstation ──────────────────────
# One-time operator step:
#   kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
# The ConfigMap below only carries the setup instructions as text; it does
# not configure anything by itself.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mig-setup-instructions
  namespace: velocity-os
data:
  README: |
    RTX 6000 Blackwell MIG Setup (run on workstation before deploying):
    1. Enable MIG mode:
       sudo nvidia-smi -i 0 --mig-mode=ENABLE && sudo reboot
    2. Create two 3g.48gb instances (post-reboot):
       sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
    3. Label K3s node:
       kubectl label node velocity-workstation \
         nvidia.com/mig.strategy=mixed \
         kubernetes.io/hostname=velocity-workstation
    4. Verify resource availability:
       kubectl describe node velocity-workstation | grep nvidia
       Expected output:
         nvidia.com/mig-3g.48gb: 2 (2 slices available)
    Deployment assignments:
      core-api → nvidia.com/mig-3g.48gb: 1 (SGLang, slice 0)
      media-engine → nvidia.com/mig-3g.48gb: 1 (ComfyUI, slice 1)

View File

@@ -0,0 +1,64 @@
{
	# Global options: contact e-mail and the default (non-access) log sink.
	email admin@desineuron.in
	log {
		output file /var/log/caddy/admin.log
		format json
	}
}

# Snippet: JSON access log shared by every site block below.
(desineuron_access_log) {
	log {
		output file /var/log/caddy/access.log
		format json
	}
}

# Snippet: forward to the local HTTPS upstream on 8443, passing the original
# host/scheme/client address along. Upstream certificate verification is
# disabled for this loopback hop.
(desineuron_local_upstream) {
	reverse_proxy https://127.0.0.1:8443 {
		header_up Host {host}
		header_up X-Forwarded-Host {host}
		header_up X-Forwarded-Proto {scheme}
		header_up X-Forwarded-For {remote_host}
		transport http {
			tls_insecure_skip_verify
		}
	}
}

# Office-suite hostnames: served with the certificate pair on disk.
office.desineuron.in, git.desineuron.in, cloud.desineuron.in, projects.desineuron.in, talk.desineuron.in, vpn.desineuron.in {
	tls /etc/caddy/tls/fullchain.pem /etc/caddy/tls/privkey.pem
	import desineuron_access_log
	import desineuron_local_upstream
}

# Velocity front door: managed LLM upstream rules are pulled in before the
# generic proxy.
velocity.desineuron.in {
	import desineuron_access_log
	import /etc/caddy/managed/llm_upstream.caddy_inc
	import desineuron_local_upstream
}

ops.desineuron.in {
	import desineuron_access_log
	import desineuron_local_upstream
}

# Additional site definitions maintained outside this file.
import /etc/caddy/managed/*.caddy

View File

@@ -0,0 +1,158 @@
# ============================================================
# Velocity-OS — K3s Traefik Ingress
# Domain: velocity.local | TLS: issued by a private CA (bootstrapped from a self-signed issuer) via cert-manager
# ============================================================
# ── cert-manager ClusterIssuer (self-signed for velocity.local) ──
---
# Bootstrap issuer: self-signs the velocity-local-ca root certificate.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: velocity-selfsigned-issuer
spec:
  selfSigned: {}
---
# Self-signed CA Certificate
# Root of the velocity.local PKI; its key pair is stored in
# velocity-local-ca-secret, which the velocity-ca-issuer ClusterIssuer reads.
# NOTE(review): a ClusterIssuer resolves its CA secret in cert-manager's
# "cluster resource namespace". Confirm that this is velocity-infra for the
# cert-manager installation used here; otherwise the issuer will not find
# the secret.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: velocity-local-ca
  namespace: velocity-infra
spec:
  isCA: true
  commonName: velocity-local-ca
  secretName: velocity-local-ca-secret
  privateKey:
    algorithm: ECDSA
    size: 256
  issuerRef:
    group: cert-manager.io
    kind: ClusterIssuer
    name: velocity-selfsigned-issuer
---
# CA-backed ClusterIssuer for velocity.local
# Signs leaf certificates with the key pair held in velocity-local-ca-secret.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: velocity-ca-issuer
spec:
  ca:
    secretName: velocity-local-ca-secret
---
# TLS Certificate for velocity.local
# Leaf certificate referenced by the Ingress; covers the apex name and
# every direct subdomain.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: velocity-local-tls
  namespace: velocity-os
spec:
  secretName: velocity-local-tls-secret
  commonName: velocity.local
  subject:
    organizations:
      - Desineuron
  dnsNames:
    - velocity.local
    - "*.velocity.local"
  duration: 8760h  # 1 year
  renewBefore: 720h  # renew 30 days before expiry
  issuerRef:
    group: cert-manager.io
    kind: ClusterIssuer
    name: velocity-ca-issuer
---
# ── Main Ingress ─────────────────────────────────────────────
# Routes https://velocity.local to the backend Services in velocity-os.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: velocity-os-ingress
  namespace: velocity-os
  annotations:
    # Traefik (K3s built-in)
    kubernetes.io/ingress.class: traefik
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    # No WebSocket middleware is attached on purpose. Traefik forwards
    # WebSocket upgrade requests as-is, whereas the ws-headers middleware
    # stamped `Connection: Upgrade` / `Upgrade: websocket` onto every
    # request of this router, including ordinary /api calls and SPA asset
    # fetches.
spec:
  tls:
    - hosts:
        - velocity.local
      secretName: velocity-local-tls-secret
  rules:
    - host: velocity.local
      http:
        # With pathType Prefix the longest matching path wins, so the order
        # of the entries below is for readability only.
        paths:
          # API (FastAPI backend)
          - path: /api
            pathType: Prefix
            backend:
              service:
                name: core-api
                port:
                  number: 8443
          # WebSockets (Sentinel, Oracle canvas, Catalyst)
          - path: /ws
            pathType: Prefix
            backend:
              service:
                name: core-api
                port:
                  number: 8443
          # Dream Weaver gateway
          - path: /dream-weaver
            pathType: Prefix
            backend:
              service:
                name: media-engine
                port:
                  number: 8290
          # Vault public links (no auth)
          - path: /vault
            pathType: Prefix
            backend:
              service:
                name: core-api
                port:
                  number: 8443
          # WebOS (React SPA — catch-all for everything else)
          - path: /
            pathType: Prefix
            backend:
              service:
                name: webos
                port:
                  number: 80
---
# ── Traefik Middleware: WebSocket upgrade headers ─────────────
# Sets the two upgrade headers on every request that passes through it.
# NOTE(review): attach only to routers that serve WebSocket traffic
# exclusively; on a shared router it would mark plain HTTP requests as
# upgrade requests too.
apiVersion: traefik.containo.us/v1alpha1
kind: Middleware
metadata:
  name: ws-headers
  namespace: velocity-os
spec:
  headers:
    customRequestHeaders:
      Upgrade: "websocket"
      Connection: "Upgrade"
---
# ── Traefik Middleware: Security headers ─────────────────────
# Response hardening: one-year HSTS (subdomains included, always sent),
# MIME-sniffing protection, XSS filter header, referrer policy and a
# frame-embedding ban.
apiVersion: traefik.containo.us/v1alpha1
kind: Middleware
metadata:
  name: security-headers
  namespace: velocity-os
spec:
  headers:
    # HSTS
    stsSeconds: 31536000
    stsIncludeSubdomains: true
    forceSTSHeader: true
    # Browser-side protections
    contentTypeNosniff: true
    browserXssFilter: true
    frameDeny: true
    referrerPolicy: strict-origin-when-cross-origin

View File

@@ -0,0 +1,27 @@
# ============================================================
# Velocity-OS — K3s Namespaces
# ============================================================
---
# Namespace used by the application manifests (Services, Ingress, Jobs, PVCs).
apiVersion: v1
kind: Namespace
metadata:
  name: velocity-os
  labels:
    environment: production
    app.kubernetes.io/managed-by: velocity-os
---
# Namespace: velocity-agents
apiVersion: v1
kind: Namespace
metadata:
  name: velocity-agents
  labels:
    environment: production
    app.kubernetes.io/managed-by: velocity-os
---
# Namespace holding infrastructure objects such as the local CA certificate.
apiVersion: v1
kind: Namespace
metadata:
  name: velocity-infra
  labels:
    environment: production
    app.kubernetes.io/managed-by: velocity-os

View File

@@ -0,0 +1,82 @@
# ============================================================
# Velocity-OS — K3s Services
# ClusterIP for internal services; `clusterIP: None` (headless) for the postgres StatefulSet
# ============================================================
---
# PostgreSQL: headless Service selecting the app=postgres pods on 5432.
apiVersion: v1
kind: Service
metadata:
  name: postgres
  namespace: velocity-os
  labels:
    app: postgres
spec:
  # Headless for StatefulSet stable DNS
  clusterIP: None
  ports:
    - port: 5432
      targetPort: 5432
  selector:
    app: postgres
---
# Redis: cluster-internal Service selecting the app=redis pods on 6379.
apiVersion: v1
kind: Service
metadata:
  name: redis
  namespace: velocity-os
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: 6379
  selector:
    app: redis
---
# core-api: cluster-internal Service on 8443; the Ingress sends /api, /ws
# and /vault here.
apiVersion: v1
kind: Service
metadata:
  name: core-api
  namespace: velocity-os
  labels:
    app: core-api
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 8443
      targetPort: 8443
  selector:
    app: core-api
---
# webos: cluster-internal Service on port 80; the Ingress catch-all path
# points here.
apiVersion: v1
kind: Service
metadata:
  name: webos
  namespace: velocity-os
  labels:
    app: webos
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 80
      targetPort: 80
  selector:
    app: webos
---
# media-engine: cluster-internal Service on 8290 (the container port the
# media-engine pod exposes); the Ingress sends /dream-weaver here.
apiVersion: v1
kind: Service
metadata:
  name: media-engine
  namespace: velocity-os
  labels:
    app: media-engine
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 8290
      targetPort: 8290
  selector:
    app: media-engine

View File

@@ -0,0 +1,132 @@
# ============================================================
# Velocity-OS — K3s StorageClasses + PersistentVolumes
# Target: RTX 6000 Blackwell workstation NVMe drive
# ============================================================
---
# StorageClass: local-nvme (no provisioner — manually bound PVs)
# Binding waits for the first consuming pod; released volumes are retained.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: local-nvme
provisioner: kubernetes.io/no-provisioner
reclaimPolicy: Retain
volumeBindingMode: WaitForFirstConsumer
---
# PV: PostgreSQL data (50Gi on NVMe)
# Local volume pinned to the velocity-workstation node; the app=postgres
# label is what pvc-postgres-data selects on.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-postgres-data
  labels:
    app: postgres
spec:
  storageClassName: local-nvme
  persistentVolumeReclaimPolicy: Retain
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  local:
    path: /opt/dlami/nvme/data/postgres
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - velocity-workstation
---
# PV: AI model cache (500Gi — Wan 2.2, Qwen-Image, Qwen3.6)
# Read-only local volume pinned to velocity-workstation; selected by
# pvc-model-cache through the app=model-cache label.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-model-cache
  labels:
    app: model-cache
spec:
  storageClassName: local-nvme
  persistentVolumeReclaimPolicy: Retain
  capacity:
    storage: 500Gi
  accessModes:
    - ReadOnlyMany
  local:
    path: /opt/dlami/nvme/models
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - velocity-workstation
---
# PV: Generated asset store (200Gi)
# Read-write local volume pinned to velocity-workstation; selected by
# pvc-asset-store through the app=asset-store label.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-asset-store
  labels:
    app: asset-store
spec:
  storageClassName: local-nvme
  persistentVolumeReclaimPolicy: Retain
  capacity:
    storage: 200Gi
  accessModes:
    - ReadWriteMany
  local:
    path: /opt/dlami/nvme/assets
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - velocity-workstation
---
# PVCs
# Claim for the PostgreSQL data volume; the selector restricts binding to
# PVs labelled app=postgres.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-postgres-data
  namespace: velocity-os
spec:
  storageClassName: local-nvme
  accessModes:
    - ReadWriteOnce
  selector:
    matchLabels:
      app: postgres
  resources:
    requests:
      storage: 50Gi
---
# Claim for the model cache; the selector restricts binding to PVs labelled
# app=model-cache.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-model-cache
  namespace: velocity-os
spec:
  storageClassName: local-nvme
  accessModes:
    - ReadOnlyMany
  selector:
    matchLabels:
      app: model-cache
  resources:
    requests:
      storage: 500Gi
---
# Claim for the generated asset store; the selector restricts binding to
# PVs labelled app=asset-store.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-asset-store
  namespace: velocity-os
spec:
  storageClassName: local-nvme
  accessModes:
    - ReadWriteMany
  selector:
    matchLabels:
      app: asset-store
  resources:
    requests:
      storage: 200Gi

View File

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import subprocess
import sys
from pathlib import Path
import boto3
# Checkpoints hydrated when COMFY_CHECKPOINTS_JSON is not set:
# target filename -> source URI.
DEFAULT_CHECKPOINTS = {
    "realvisxlV50_v50LightningBakedvae.safetensors": "s3://project-velocity/models/realvisxlV50_v50LightningBakedvae.safetensors",
}
def load_env_file(path: Path) -> dict[str, str]:
    """Parse a simple ``KEY=VALUE`` env file into a dictionary.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    A leading ``export `` on the key is dropped and one pair of matching
    surrounding quotes (single or double) is removed from the value, so
    ``export KEY="value"`` yields ``{"KEY": "value"}`` rather than a key
    named ``export KEY`` with literal quote characters in the value.

    Args:
        path: Location of the env file.

    Returns:
        Mapping of variable name to value; empty when ``path`` does not exist.
    """
    data: dict[str, str] = {}
    if not path.exists():
        return data
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()
        value = value.strip()
        # Only strip quotes when they form a matching pair around the value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        data[key] = value
    return data
def env(name: str, default: str = "") -> str:
    """Return environment variable ``name``, or ``default`` when it is unset."""
    value = os.environ.get(name)
    return default if value is None else value
def resolve_target_instance(ec2) -> dict | None:
    """Pick the EC2 instance the checkpoints should be hydrated on.

    When ``COMFY_INSTANCE_ID`` is set, only that instance is looked up.
    Otherwise running instances are searched by tag, using
    ``COMFY_INSTANCE_TAG_KEY`` (default ``DesineuronRole``) and
    ``COMFY_INSTANCE_TAG_VALUE`` (default ``comfyui``).

    Args:
        ec2: Client object exposing ``describe_instances``.

    Returns:
        The most recently launched instance in state ``running``, or
        ``None`` when there is none.
    """
    instance_id = env("COMFY_INSTANCE_ID")
    if instance_id:
        response = ec2.describe_instances(InstanceIds=[instance_id])
    else:
        tag_key = env("COMFY_INSTANCE_TAG_KEY", "DesineuronRole")
        tag_value = env("COMFY_INSTANCE_TAG_VALUE", "comfyui")
        response = ec2.describe_instances(
            Filters=[
                {"Name": "instance-state-name", "Values": ["running"]},
                {"Name": f"tag:{tag_key}", "Values": [tag_value]},
            ]
        )

    # The explicit-id lookup is not state-filtered by the API call, so the
    # running check is applied here for both paths.
    running = []
    for reservation in response["Reservations"]:
        for candidate in reservation["Instances"]:
            if candidate["State"]["Name"] == "running":
                running.append(candidate)

    if not running:
        return None
    # max() returns the first of equally-new instances, matching a stable
    # descending sort followed by taking element 0.
    return max(running, key=lambda row: row["LaunchTime"])
def parse_checkpoints() -> dict[str, str]:
    """Return the checkpoint map: target filename -> source URI.

    Reads ``COMFY_CHECKPOINTS_JSON``; when it is unset or empty a copy of
    ``DEFAULT_CHECKPOINTS`` is returned instead.

    Raises:
        ValueError: If the variable holds JSON that is not an object
            (``json.JSONDecodeError``, a ``ValueError`` subclass, is raised
            for malformed JSON).
    """
    raw = env("COMFY_CHECKPOINTS_JSON")
    if raw:
        parsed = json.loads(raw)
        if isinstance(parsed, dict):
            # Coerce keys and values to str so callers can rely on the type.
            return {str(name): str(source) for name, source in parsed.items()}
        raise ValueError("COMFY_CHECKPOINTS_JSON must be a JSON object of filename to source URI")
    return dict(DEFAULT_CHECKPOINTS)
def remote_hydration_script(checkpoints: dict[str, str]) -> str:
    """Render the bash script that hydrates checkpoints on the GPU host.

    The script (meant to be piped into ``bash -s``):

    1. refuses to run unless ``/opt/dlami/nvme`` is a mount point, and only
       then creates the checkpoint directory, so nothing is written to the
       root disk when the NVMe is missing;
    2. writes a filename/source manifest to a ``mktemp`` file that is
       removed on exit, including on failure;
    3. downloads each missing or empty checkpoint with ``aws s3 cp`` via a
       ``.part`` file and renames it into place;
    4. restarts ``comfyui`` if anything changed, then queries the local
       ``/models/checkpoints`` endpoint, retrying while the service starts.

    Args:
        checkpoints: Mapping of target filename to source URI.

    Returns:
        The complete script text.
    """
    # The JSON is embedded as a Python string literal (repr) inside a quoted
    # heredoc, so bash performs no expansion on it.
    payload = json.dumps(checkpoints)
    return f"""#!/usr/bin/env bash
set -euo pipefail
CHECKPOINT_DIR="${{COMFY_CHECKPOINT_DIR:-/opt/dlami/nvme/ComfyUI/models/checkpoints}}"
if ! mountpoint -q /opt/dlami/nvme; then
  echo "GPU NVMe mount /opt/dlami/nvme is not mounted" >&2
  exit 2
fi
mkdir -p "$CHECKPOINT_DIR"
manifest="$(mktemp)"
trap 'rm -f "$manifest"' EXIT
changed=0
python3 - <<'PY' > "$manifest"
import json
for name, source in json.loads({payload!r}).items():
    print(f"{{name}}\\t{{source}}")
PY
while IFS=$'\\t' read -r filename source; do
  target="$CHECKPOINT_DIR/$filename"
  if [ ! -s "$target" ]; then
    tmp="$target.part"
    rm -f "$tmp"
    aws s3 cp "$source" "$tmp" --no-progress
    mv "$tmp" "$target"
    chmod 0644 "$target"
    changed=1
  fi
done < "$manifest"
if [ "$changed" = "1" ]; then
  sudo systemctl restart comfyui
fi
curl -fsS --retry 10 --retry-delay 3 --retry-connrefused http://127.0.0.1:8188/models/checkpoints
"""
def main() -> int:
    """Hydrate ComfyUI checkpoints on the running GPU instance over SSH.

    AWS settings missing from the process environment are filled in from
    the ops env file, the target instance is resolved through EC2, and the
    hydration script is piped to ``bash -s`` on that host.

    Returns:
        0 on success, 1 when no running instance or no usable IP address is
        found, otherwise the exit status of the ssh command.
    """
    ops_env_path = Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env"))
    ops_env = load_env_file(ops_env_path)

    # Values already in the process environment win; the ops file only
    # supplies what is missing.
    for aws_var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"):
        if aws_var in ops_env and aws_var not in os.environ:
            os.environ[aws_var] = ops_env[aws_var]
    if "AWS_DEFAULT_REGION" not in os.environ:
        os.environ["AWS_DEFAULT_REGION"] = ops_env.get("OPS_DEFAULT_REGION", "us-east-1")

    fallback_key_path = ops_env.get(
        "OPS_SSH_KEY_PATH",
        "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem",
    )
    key_path = env("GPU_SSH_KEY_PATH", fallback_key_path)
    # Paths under /app/state/ are rewritten to the ops control-plane state
    # directory on this host (presumably /app/state is the in-container
    # view of the same directory; verify against the deployment).
    app_state_prefix = "/app/state/"
    if key_path.startswith(app_state_prefix):
        key_path = key_path.replace(app_state_prefix, "/opt/desineuron-ops-control-plane/state/")
    ssh_user = env("GPU_SSH_USER", "ubuntu")

    ec2 = boto3.client("ec2", region_name=os.environ["AWS_DEFAULT_REGION"])
    instance = resolve_target_instance(ec2)
    if not instance:
        print("No running ComfyUI GPU instance found", file=sys.stderr)
        return 1

    # Prefer the public address; fall back to the private one.
    target_host = instance.get("PublicIpAddress") or instance.get("PrivateIpAddress")
    if not target_host:
        print("Target GPU instance has no reachable IP", file=sys.stderr)
        return 1

    checkpoints = parse_checkpoints()
    ssh_options = ["-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=15"]
    ssh_command = ["sudo", "ssh", *ssh_options, "-i", key_path, f"{ssh_user}@{target_host}", "bash -s"]
    completed = subprocess.run(
        ssh_command,
        input=remote_hydration_script(checkpoints),
        text=True,
        capture_output=True,
        check=False,
    )

    if completed.stdout:
        print(completed.stdout.strip())
    if completed.returncode == 0:
        return 0
    # stderr is only surfaced when the remote run failed.
    if completed.stderr:
        print(completed.stderr.strip(), file=sys.stderr)
    return completed.returncode


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# ============================================================
# Velocity-OS — Model Hydration Script
# Uses s5cmd for high-throughput parallel S3 → NVMe sync.
# Run once at initial install; safe to re-run for updates.
#
# Models synced (all stored on NVMe, never in Docker images):
#   - Wan 2.2 (ComfyUI video/image model)
#   - Qwen-Image 2512 (ComfyUI poster/image model)
#   - Qwen3.6 35B A3B (SGLang LLM — MIG slice 0)
#
# Requires: s5cmd, AWS credentials with S3 read access
#
# Exit status: 0 on success; non-zero when s5cmd is missing, a sync fails,
# or any checksum verification fails.
# ============================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────
S3_BUCKET="${S3_MODEL_BUCKET:-s3://desineuron-models}"
LOCAL_BASE="/opt/dlami/nvme/models"
S5CMD_CONCURRENCY="${S5CMD_CONCURRENCY:-32}" # Tune to NVMe write IOPS
CREDENTIALS_FILE="/etc/velocity/aws-credentials"

# Local target directories, one per model.
WAN_DIR="${LOCAL_BASE}/comfy/wan2.2"
QWEN_IMAGE_DIR="${LOCAL_BASE}/comfy/qwen-image-2512"
QWEN_LLM_DIR="${LOCAL_BASE}/llm/qwen3.6-35b-a3b"
MODEL_DIRS=("${WAN_DIR}" "${QWEN_IMAGE_DIR}" "${QWEN_LLM_DIR}")

# sync_model START_MSG S3_PREFIX DEST_DIR DONE_MSG
# Mirrors ${S3_BUCKET}/S3_PREFIX/* into DEST_DIR with s5cmd.
sync_model() {
  local start_msg="$1" s3_prefix="$2" dest_dir="$3" done_msg="$4"
  echo "${start_msg}"
  s5cmd \
    --numworkers "${S5CMD_CONCURRENCY}" \
    --credentials-file "${CREDENTIALS_FILE}" \
    sync \
    "${S3_BUCKET}/${s3_prefix}/*" \
    "${dest_dir}/"
  echo "${done_msg}"
}

# ── Ensure directories exist ─────────────────────────────────
mkdir -p "${MODEL_DIRS[@]}"

# ── Check s5cmd installed ────────────────────────────────────
if ! command -v s5cmd &> /dev/null; then
  echo "ERROR: s5cmd not found. Install from https://github.com/peak/s5cmd" >&2
  echo " curl -L https://github.com/peak/s5cmd/releases/latest/download/s5cmd_Linux_x86_64.tar.gz | tar xz -C /usr/local/bin" >&2
  exit 1
fi

echo "=== Velocity-OS Model Hydration ==="
echo "Source: ${S3_BUCKET}"
echo "Target: ${LOCAL_BASE}"
echo "s5cmd workers: ${S5CMD_CONCURRENCY}"
echo ""

# ── Wan 2.2 (ComfyUI — MIG slice 1) ──────────────────────────
sync_model "[1/3] Syncing Wan 2.2..." "wan2.2" "${WAN_DIR}" " ✓ Wan 2.2 synced."

# ── Qwen-Image 2512 (ComfyUI — MIG slice 1) ──────────────────
sync_model "[2/3] Syncing Qwen-Image 2512..." "qwen-image-2512" "${QWEN_IMAGE_DIR}" " ✓ Qwen-Image 2512 synced."

# ── Qwen3.6 35B A3B (SGLang — MIG slice 0) ───────────────────
sync_model "[3/3] Syncing Qwen3.6 35B A3B (LLM — ~70GB, be patient)..." "qwen3.6-35b-a3b" "${QWEN_LLM_DIR}" " ✓ Qwen3.6 35B synced."

# ── Verify checksums (optional — if .sha256 files exist in S3) ─
echo ""
echo "=== Verifying checksums ==="
checksum_failures=0
for dir in "${MODEL_DIRS[@]}"; do
  # compgen -G succeeds only when the glob matches at least one file.
  if compgen -G "${dir}/*.sha256" > /dev/null; then
    echo " Checking ${dir}..."
    # Explicit if/else: a bare `( … ) && echo` would let a mismatch slip
    # past `set -e` and the script would still report success.
    if (cd "${dir}" && sha256sum --quiet -c ./*.sha256); then
      echo "${dir} checksums OK"
    else
      echo "ERROR: checksum verification failed in ${dir}" >&2
      checksum_failures=$((checksum_failures + 1))
    fi
  fi
done
if (( checksum_failures > 0 )); then
  echo "ERROR: ${checksum_failures} model directory(ies) failed checksum verification" >&2
  exit 1
fi

echo ""
echo "=== Model hydration complete ==="
echo "NVMe usage:"
du -sh "${LOCAL_BASE}"/*/* 2>/dev/null || true

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
# Replace the single comfyui.service with a pool of comfyui-worker@N units.
#
# Environment:
#   COMFY_WORKER_COUNT           number of workers to run (default 4);
#                                worker N is probed on port 8188 + N
#   STOP_SGLANG_FOR_COMFY_POOL   set to 1 to stop desineuron-sglang.service first
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STOP_SGLANG_FOR_COMFY_POOL="${STOP_SGLANG_FOR_COMFY_POOL:-0}"
WORKERS="${COMFY_WORKER_COUNT:-4}"

# Reject non-numeric or zero counts up front instead of failing inside the
# arithmetic below or silently starting nothing.
if ! [[ "$WORKERS" =~ ^[1-9][0-9]*$ ]]; then
  echo "COMFY_WORKER_COUNT must be a positive integer (got '${WORKERS}')" >&2
  exit 1
fi

if [[ ! -d /opt/dlami/nvme/ComfyUI ]]; then
  echo "Missing ComfyUI at /opt/dlami/nvme/ComfyUI" >&2
  exit 1
fi
if ! mountpoint -q /opt/dlami/nvme; then
  echo "/opt/dlami/nvme is not mounted; refusing to run model workers on root disk" >&2
  exit 1
fi

if [[ "$STOP_SGLANG_FOR_COMFY_POOL" == "1" ]]; then
  sudo systemctl stop desineuron-sglang.service || true
fi

# The standalone service is superseded by the worker pool.
sudo systemctl stop comfyui.service || true
sudo systemctl disable comfyui.service || true

sudo install -m 0755 "$SCRIPT_DIR/desineuron-start-comfy-worker" /usr/local/bin/desineuron-start-comfy-worker
sudo install -m 0644 "$SCRIPT_DIR/comfyui-worker@.service" /etc/systemd/system/comfyui-worker@.service
sudo systemctl daemon-reload

for ((index = 0; index < WORKERS; index++)); do
  # enable + restart: restart also starts a stopped unit, so each worker is
  # launched exactly once and picks up the freshly installed files.
  sudo systemctl enable "comfyui-worker@${index}.service"
  sudo systemctl restart "comfyui-worker@${index}.service"
done

for ((index = 0; index < WORKERS; index++)); do
  port=$((8188 + index))
  echo "worker ${index} http://127.0.0.1:${port}"
  # Retry while the worker is still starting. The body is captured first and
  # truncated afterwards: piping curl into `head -c` can make curl fail on a
  # closed pipe, which under pipefail would abort the whole script.
  body="$(curl -fsS --retry 20 --retry-delay 3 --retry-connrefused "http://127.0.0.1:${port}/models/checkpoints")"
  printf '%s\n' "${body:0:500}"
done

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env bash
# Install the SGLang runtime into a virtualenv and register it as the
# desineuron-sglang systemd service.
#
# Every setting below can be overridden through the environment variable
# named in its default expansion (e.g. SGLANG_PORT, SGLANG_MODEL_PATH).
# Files written:
#   /etc/default/desineuron-sglang               runtime settings (mode 600)
#   /usr/local/bin/desineuron-sglang-launch.sh   launcher used by the unit
#   /etc/systemd/system/desineuron-sglang.service
set -euo pipefail

NVME_ROOT="${NVME_ROOT:-/opt/dlami/nvme/sglang}"
RUNTIME_ROOT="${RUNTIME_ROOT:-/opt/desineuron-sglang}"
VENV_PATH="${RUNTIME_ROOT}/.venv"
PORT="${SGLANG_PORT:-30100}"
HOST="${SGLANG_HOST:-}"
MODEL_ID="${SGLANG_MODEL_ID:-qwen3.6-35b-a3b}"
MODEL_PATH="${SGLANG_MODEL_PATH:-/opt/dlami/nvme/models/Qwen-Qwen3.6-35B-A3B-FP8}"
TP_SIZE="${SGLANG_TP_SIZE:-4}"
CONTEXT_LENGTH="${SGLANG_CONTEXT_LENGTH:-131072}"
MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_STATIC:-0.88}"
ATTENTION_BACKEND="${SGLANG_ATTENTION_BACKEND:-flashinfer}"
DIST_INIT_ADDR="${SGLANG_DIST_INIT_ADDR:-127.0.0.1:50000}"

# ── Resolve the bind address ─────────────────────────────────
# Order: explicit SGLANG_HOST → EC2 instance metadata (IMDSv2) → first
# address reported by `hostname -I`. The metadata calls are time-boxed so a
# host without a metadata endpoint falls through quickly instead of waiting
# for the default connect timeout.
if [[ -z "${HOST}" ]]; then
  IMDS_TOKEN="$(curl -fsS --connect-timeout 2 --max-time 5 -X PUT http://169.254.169.254/latest/api/token -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600' || true)"
  if [[ -n "${IMDS_TOKEN}" ]]; then
    HOST="$(curl -fsS --connect-timeout 2 --max-time 5 -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" http://169.254.169.254/latest/meta-data/local-ipv4 || true)"
  fi
fi
if [[ -z "${HOST}" ]]; then
  HOST="$(hostname -I | awk '{print $1}')"
fi
if [[ -z "${HOST}" ]]; then
  echo "Unable to resolve GPU private IP for SGLang host binding" >&2
  exit 1
fi

# ── Directories and virtualenv ───────────────────────────────
sudo mkdir -p "${NVME_ROOT}"/{cache,logs,state} "${RUNTIME_ROOT}"
# The directory was just created by root, but the venv and pip steps run as
# the invoking user; hand ownership over so they can write there.
sudo chown -R "$(id -un):$(id -gn)" "${RUNTIME_ROOT}"
python3 -m venv "${VENV_PATH}"
"${VENV_PATH}/bin/pip" install --upgrade pip wheel setuptools
"${VENV_PATH}/bin/pip" install "sglang[all]>=0.5.3" flashinfer-python huggingface_hub

# ── Runtime settings (expanded now, read by the launcher and the unit) ──
sudo tee /etc/default/desineuron-sglang >/dev/null <<EOF
SGLANG_HOST=${HOST}
SGLANG_PORT=${PORT}
SGLANG_MODEL_ID=${MODEL_ID}
SGLANG_MODEL_PATH=${MODEL_PATH}
SGLANG_TP_SIZE=${TP_SIZE}
SGLANG_CONTEXT_LENGTH=${CONTEXT_LENGTH}
SGLANG_MEM_FRACTION_STATIC=${MEM_FRACTION_STATIC}
SGLANG_ATTENTION_BACKEND=${ATTENTION_BACKEND}
SGLANG_DIST_INIT_ADDR=${DIST_INIT_ADDR}
SGLANG_CACHE_DIR=${NVME_ROOT}/cache
SGLANG_LOG_DIR=${NVME_ROOT}/logs
SGLANG_STATE_DIR=${NVME_ROOT}/state
SGLANG_VENV_PATH=${VENV_PATH}
SGLANG_USE_FLASHINFER=1
SGLANG_ENABLE_PREFIX_CACHE=1
SGLANG_SERVED_MODEL_NAME=${MODEL_ID}
SGLANG_EXTRA_ARGS=
EOF
sudo chmod 600 /etc/default/desineuron-sglang

# ── Launcher (quoted heredoc: variables are resolved at service start) ──
sudo tee /usr/local/bin/desineuron-sglang-launch.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
source /etc/default/desineuron-sglang
export HF_HOME="${SGLANG_CACHE_DIR}/hf"
export HUGGINGFACE_HUB_CACHE="${SGLANG_CACHE_DIR}/hf"
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SGLANG_USE_FLASHINFER="${SGLANG_USE_FLASHINFER}"
# Use the venv recorded at install time; fall back to the default location
# for settings files that do not carry SGLANG_VENV_PATH.
# SGLANG_EXTRA_ARGS is intentionally unquoted so it splits into separate
# arguments.
exec "${SGLANG_VENV_PATH:-/opt/desineuron-sglang/.venv}/bin/sglang" serve \
  --host "${SGLANG_HOST}" \
  --port "${SGLANG_PORT}" \
  --model-path "${SGLANG_MODEL_PATH}" \
  --served-model-name "${SGLANG_SERVED_MODEL_NAME}" \
  --tp-size "${SGLANG_TP_SIZE}" \
  --context-length "${SGLANG_CONTEXT_LENGTH}" \
  --mem-fraction-static "${SGLANG_MEM_FRACTION_STATIC}" \
  --attention-backend "${SGLANG_ATTENTION_BACKEND}" \
  --dist-init-addr "${SGLANG_DIST_INIT_ADDR}" \
  --enable-metrics \
  --skip-server-warmup \
  ${SGLANG_EXTRA_ARGS}
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-launch.sh

# ── systemd unit ─────────────────────────────────────────────
sudo tee /etc/systemd/system/desineuron-sglang.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
EnvironmentFile=/etc/default/desineuron-sglang
WorkingDirectory=${RUNTIME_ROOT}
ExecStart=/usr/local/bin/desineuron-sglang-launch.sh
Restart=always
RestartSec=5
LimitNOFILE=1048576

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang.service
sudo systemctl --no-pager --full status desineuron-sglang.service

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# Install the SGLang watchdog: a health-check script, a model hydration
# helper, and a systemd oneshot service driven by a 5-minute timer.
set -euo pipefail

# ── Watchdog script ──────────────────────────────────────────
sudo tee /usr/local/bin/desineuron-sglang-watchdog.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
# Keeps desineuron-sglang.service alive:
#   1. hydrates the model directory if it is missing,
#   2. starts the service if it is not active,
#   3. skips the health probe while the process is younger than the
#      startup grace period,
#   4. otherwise probes /v1/models and restarts the service once on failure.
# Exits non-zero when the final probe fails.
set -euo pipefail
source /etc/default/desineuron-sglang
HEALTH_URL="http://127.0.0.1:${SGLANG_PORT}/v1/models"
HYDRATE_HELPER="/usr/local/bin/desineuron-sglang-hydrate.sh"
STARTUP_GRACE_SECONDS="${SGLANG_STARTUP_GRACE_SECONDS:-900}"
HEALTH_TIMEOUT_SECONDS="${SGLANG_HEALTH_TIMEOUT_SECONDS:-60}"

if [[ ! -d "${SGLANG_MODEL_PATH}" ]]; then
  # SGLANG_MODEL_ID doubles as the served model name, which is not
  # necessarily a Hugging Face repo id; SGLANG_HF_REPO_ID, when set in the
  # settings file, takes precedence for the download.
  "${HYDRATE_HELPER}" "${SGLANG_HF_REPO_ID:-${SGLANG_MODEL_ID}}" "${SGLANG_MODEL_PATH}"
fi

if ! systemctl is-active --quiet desineuron-sglang.service; then
  systemctl restart desineuron-sglang.service
  sleep 10
fi

main_pid="$(systemctl show -p MainPID --value desineuron-sglang.service || true)"
if [[ -n "${main_pid}" && "${main_pid}" != "0" ]]; then
  # Process age is approximated by the mtime of /proc/<pid>; if that cannot
  # be read the age evaluates to 0 and the grace period applies.
  runtime_age="$(( $(date +%s) - $(stat -c %Y "/proc/${main_pid}" 2>/dev/null || date +%s) ))"
  if (( runtime_age < STARTUP_GRACE_SECONDS )); then
    echo "startup_grace"
    exit 0
  fi
fi

if ! curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null; then
  systemctl restart desineuron-sglang.service
  sleep 20
fi
curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null
echo "healthy"
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-watchdog.sh

# ── Hydration helper ─────────────────────────────────────────
sudo tee /usr/local/bin/desineuron-sglang-hydrate.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
# Usage: desineuron-sglang-hydrate.sh <hf-repo-id> <target-dir>
# Downloads a Hugging Face repository snapshot into <target-dir>.
set -euo pipefail
MODEL_ID="${1:?model id required}"
TARGET_PATH="${2:?target path required}"

# huggingface_hub is installed into the SGLang virtualenv, which is not on
# the PATH of a systemd-started process; put its bin directory first so both
# `hf` and `python3` resolve there when present.
VENV_BIN="${SGLANG_VENV_BIN:-/opt/desineuron-sglang/.venv/bin}"
if [[ -d "${VENV_BIN}" ]]; then
  export PATH="${VENV_BIN}:${PATH}"
fi

mkdir -p "$(dirname "${TARGET_PATH}")"
if command -v hf >/dev/null 2>&1; then
  hf download "${MODEL_ID}" --local-dir "${TARGET_PATH}" --max-workers 8
else
  # Values travel through the environment and the heredoc is quoted, so
  # nothing is interpolated into the Python source.
  HYDRATE_MODEL_ID="${MODEL_ID}" HYDRATE_TARGET_PATH="${TARGET_PATH}" python3 - <<'PY'
import os
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id=os.environ["HYDRATE_MODEL_ID"],
    local_dir=os.environ["HYDRATE_TARGET_PATH"],
    max_workers=8,
)
PY
fi
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-hydrate.sh

# ── systemd service + timer ──────────────────────────────────
sudo tee /etc/systemd/system/desineuron-sglang-watchdog.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime Watchdog
After=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/desineuron-sglang-watchdog.sh
EOF

sudo tee /etc/systemd/system/desineuron-sglang-watchdog.timer >/dev/null <<EOF
[Unit]
Description=Run the Desineuron SGLang watchdog every 5 minutes

[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Unit=desineuron-sglang-watchdog.service

[Install]
WantedBy=timers.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang-watchdog.timer
sudo systemctl start desineuron-sglang-watchdog.service
sudo systemctl --no-pager --full status desineuron-sglang-watchdog.timer