forked from sagnik/Project_Velocity
feat: Oracle Canvas, Revision History and Canvas Sharing (#33)
Co-authored-by: Sagnik <sagnik7896@gmail.com> Reviewed-on: sagnik/Project_Velocity#33
This commit is contained in:
@@ -25,6 +25,25 @@ office.desineuron.in, git.desineuron.in, cloud.desineuron.in, projects.desineuro
|
||||
}
|
||||
}
|
||||
|
||||
# Site block for velocity.desineuron.in.
velocity.desineuron.in {
	# JSON access log.
	log {
		output file /var/log/caddy/access.log
		format json
	}

	# Managed include; must exist (it may be empty) or the config will not load.
	import /etc/caddy/managed/llm_upstream.caddy_inc

	# Everything else goes to the local HTTPS backend on port 8443.
	reverse_proxy https://127.0.0.1:8443 {
		header_up Host {host}
		header_up X-Forwarded-Host {host}
		header_up X-Forwarded-Proto {scheme}
		header_up X-Forwarded-For {remote_host}
		transport http {
			# Backend certificate is not verified (loopback upstream).
			tls_insecure_skip_verify
		}
	}
}
|
||||
|
||||
ops.desineuron.in {
|
||||
log {
|
||||
output file /var/log/caddy/access.log
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
# Stage a Hugging Face model snapshot into a local directory.
#
# Environment:
#   TARGET_PATH  destination directory for the snapshot
#   MODEL_REPO   Hugging Face repository id to download
set -euo pipefail

TARGET_PATH="${TARGET_PATH:-/opt/dlami/nvme/models/cyankiwi-Qwen3.5-122B-A10B-AWQ-4bit}"
MODEL_REPO="${MODEL_REPO:-cyankiwi/Qwen3.5-122B-A10B-AWQ-4bit}"

mkdir -p -- "${TARGET_PATH}"

if command -v hf >/dev/null 2>&1; then
  hf download "${MODEL_REPO}" --local-dir "${TARGET_PATH}" --max-workers 8
else
  # Fallback when the `hf` CLI is absent. Values are passed as argv and the
  # heredoc is quoted so that quotes or backslashes in MODEL_REPO/TARGET_PATH
  # can never be interpreted as Python source.
  python3 - "${MODEL_REPO}" "${TARGET_PATH}" <<'PY'
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1], sys.argv[2]
snapshot_download(repo_id=repo_id, local_dir=local_dir, max_workers=8)
PY
fi

echo "Staged ${MODEL_REPO} under ${TARGET_PATH}"
echo "This is an acquisition/staging path only. The live L4 runtime remains qwen3.6:35b-a3b unless explicitly cut over."
echo "Use MODEL_REPO=txn545/Qwen3.5-122B-A10B-NVFP4 only on hardware validated for NVFP4."
|
||||
17
infrastructure/desineuron_ingress/deploy_caddy_llm.sh
Normal file
17
infrastructure/desineuron_ingress/deploy_caddy_llm.sh
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
# Push the route helper and Caddyfile to the ingress proxy, trigger a local
# route sync, then validate and reload Caddy on the proxy.
#
# Optional environment overrides (defaults match the previous literals):
#   INGRESS_HOST, INGRESS_USER, INGRESS_KEY
set -euo pipefail
set -x

INGRESS_HOST="${INGRESS_HOST:-98.87.120.120}"
INGRESS_USER="${INGRESS_USER:-ec2-user}"
INGRESS_KEY="${INGRESS_KEY:-/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem}"
REMOTE="${INGRESS_USER}@${INGRESS_HOST}"
# NOTE(review): host key checking is disabled, as before; consider pinning the
# ingress host key instead.
SSH_OPTS=(-o StrictHostKeyChecking=no -i "${INGRESS_KEY}")

# Copy latest config files
sudo scp "${SSH_OPTS[@]}" /tmp/manage_desineuron_routes.py "${REMOTE}:/tmp/manage_desineuron_routes.py"
sudo scp "${SSH_OPTS[@]}" /tmp/Caddyfile "${REMOTE}:/tmp/Caddyfile"

# Bootstrap on the proxy target. The managed directory is created first so the
# `touch` of the include file cannot fail on a fresh host.
sudo ssh "${SSH_OPTS[@]}" "${REMOTE}" \
  "sudo cp /tmp/manage_desineuron_routes.py /usr/local/bin/manage_desineuron_routes.py \
&& sudo chmod +x /usr/local/bin/manage_desineuron_routes.py \
&& sudo mkdir -p /etc/caddy/managed \
&& sudo touch /etc/caddy/managed/llm_upstream.caddy_inc \
&& sudo cp /tmp/Caddyfile /etc/caddy/Caddyfile"

# Invoke immediate synchronization pulse to populate llm_upstream.caddy_inc
sudo systemctl start desineuron-llm-route-sync.service

sleep 5

# Validate before reloading so a broken Caddyfile is reported here instead of
# being handed to the running proxy.
sudo ssh "${SSH_OPTS[@]}" "${REMOTE}" \
  "sudo caddy validate --config /etc/caddy/Caddyfile && sudo systemctl reload caddy"
|
||||
@@ -0,0 +1,9 @@
|
||||
# One-shot unit that runs the LLM route sync wrapper once per activation.
[Unit]
Description=Sync llm.desineuron.in managed route to current GPU private IP
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
# Settings for the sync script are read from this file.
EnvironmentFile=/etc/desineuron-llm-route-sync.env
ExecStart=/usr/local/bin/run_llm_route_sync.sh
|
||||
@@ -0,0 +1,10 @@
|
||||
# Timer that activates desineuron-llm-route-sync.service.
[Unit]
Description=Run LLM route sync on boot and every 2 minutes

[Timer]
Unit=desineuron-llm-route-sync.service
# First run one minute after boot, then two minutes after each activation.
OnBootSec=1min
OnUnitActiveSec=2min

[Install]
WantedBy=timers.target
|
||||
108
infrastructure/desineuron_ingress/install_gpu_ollama_watchdog.sh
Normal file
108
infrastructure/desineuron_ingress/install_gpu_ollama_watchdog.sh
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env bash
# Install the Ollama NVMe storage override, a model hydrate helper, a watchdog
# script, and the systemd service + timer that run the watchdog.
set -euo pipefail

MODEL_NAME="qwen3.6:35b-a3b"
NVME_ROOT="/opt/dlami/nvme/ollama"
OLLAMA_OVERRIDE_DIR="/etc/systemd/system/ollama.service.d"

# 1. Configure Ollama to use NVME
sudo mkdir -p "${NVME_ROOT}/models" "${NVME_ROOT}/state" "${NVME_ROOT}/logs"
# The watchdog below chowns this tree to ollama:ollama; use the same owner at
# install time when that account exists so both steps agree.
if id -u ollama >/dev/null 2>&1; then
  nvme_owner="ollama:ollama"
else
  nvme_owner="root:root"
fi
sudo chown -R "${nvme_owner}" "${NVME_ROOT}"

echo "Configuring Ollama to use NVME storage at ${NVME_ROOT}/models..."
sudo mkdir -p "${OLLAMA_OVERRIDE_DIR}"
sudo tee "${OLLAMA_OVERRIDE_DIR}/override.conf" >/dev/null <<EOF
[Service]
Environment="OLLAMA_MODELS=${NVME_ROOT}/models"
Environment="OLLAMA_HOST=0.0.0.0"
EOF

sudo systemctl daemon-reload
sudo systemctl enable ollama.service
# `enable --now` leaves an already-running service untouched; restart so the
# override written above is actually applied.
sudo systemctl restart ollama.service

# 2. Write the Hydrate Helper
HYDRATE_HELPER="/usr/local/bin/desineuron-hydrate-qwen36.sh"
echo "Creating Hydrate Helper map at $HYDRATE_HELPER"
# Unquoted heredoc: \${...} is expanded now, \$... is left for the helper.
sudo tee "$HYDRATE_HELPER" >/dev/null <<EOF
#!/usr/bin/env bash
set -euo pipefail

echo "(\$(date)) Hydrating \$1 model using ollama pull..." | sudo tee -a "${NVME_ROOT}/logs/qwen36_hydrate.log"
# This requires outward access or an Ollama compatible registry proxy
# Note: For S3-based private GGUFs, this would use s5cmd
ollama pull "\$1"
echo "(\$(date)) Hydration complete" | sudo tee -a "${NVME_ROOT}/logs/qwen36_hydrate.log"
EOF
sudo chmod 0755 "$HYDRATE_HELPER"

# 3. Write Watchdog Script
WATCHDOG_SCRIPT="/usr/local/bin/desineuron-ollama-watchdog.sh"
echo "Creating Watchdog Script map at $WATCHDOG_SCRIPT"
sudo tee "$WATCHDOG_SCRIPT" >/dev/null <<EOF
#!/usr/bin/env bash
set -euo pipefail

MODEL_NAME="${MODEL_NAME}"
OLLAMA_URL="http://127.0.0.1:11434"

# Succeeds when the tag list mentions MODEL_NAME. The response is captured
# first so an early grep exit cannot fail the check under pipefail, and -F
# matches the name literally rather than as a regex.
model_present() {
  local tags
  tags="\$(curl -fsS "\$OLLAMA_URL/api/tags")" || return 1
  grep -qF -- "\$MODEL_NAME" <<<"\$tags"
}

if ! systemctl is-active --quiet ollama; then
  systemctl restart ollama
  sleep 5
fi

# Try asking Ollama if the tag exists
if ! model_present; then
  echo "Expected model \$MODEL_NAME missing. Initiating hydration..."

  # Ensure wiped ephemeral NVMe disks are scaffolded pre-hydration
  sudo mkdir -p "${NVME_ROOT}/logs" "${NVME_ROOT}/models" "${NVME_ROOT}/state"
  sudo chown -R ollama:ollama "${NVME_ROOT}"

  "${HYDRATE_HELPER}" "\$MODEL_NAME"
  sleep 5
fi

# Verify final state
if model_present; then
  echo "healthy"
  exit 0
else
  echo "unhealthy: Model \$MODEL_NAME failed to register" >&2
  exit 1
fi
EOF
sudo chmod 0755 "$WATCHDOG_SCRIPT"

# 4. Write Watchdog Systemd Service & Timer
sudo tee "/etc/systemd/system/desineuron-ollama-watchdog.service" >/dev/null <<EOF
[Unit]
Description=Desineuron GPU Ollama Watchdog for Model $MODEL_NAME
After=network-online.target
Wants=network-online.target

[Service]
Type=oneshot
Environment="HOME=/root"
ExecStart=$WATCHDOG_SCRIPT
EOF

sudo tee "/etc/systemd/system/desineuron-ollama-watchdog.timer" >/dev/null <<EOF
[Unit]
Description=Watchdog run for Ollama Model $MODEL_NAME every 5 mins

[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Unit=desineuron-ollama-watchdog.service

[Install]
WantedBy=timers.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-ollama-watchdog.timer
sudo systemctl start desineuron-ollama-watchdog.service

echo "Ollama Watchdog installed and model $MODEL_NAME setup initiated."
sudo systemctl --no-pager status desineuron-ollama-watchdog.timer
|
||||
|
||||
104
infrastructure/desineuron_ingress/install_gpu_sglang_runtime.sh
Normal file
104
infrastructure/desineuron_ingress/install_gpu_sglang_runtime.sh
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env bash
# Install the SGLang runtime: Python venv, /etc/default settings file, launch
# wrapper, and the desineuron-sglang systemd service.
#
# Every setting below can be overridden through the environment variable named
# in its default expansion.
set -euo pipefail

NVME_ROOT="${NVME_ROOT:-/opt/dlami/nvme/sglang}"
RUNTIME_ROOT="${RUNTIME_ROOT:-/opt/desineuron-sglang}"
VENV_PATH="${RUNTIME_ROOT}/.venv"
PORT="${SGLANG_PORT:-30100}"
HOST="${SGLANG_HOST:-}"
MODEL_ID="${SGLANG_MODEL_ID:-qwen3.6-35b-a3b}"
MODEL_PATH="${SGLANG_MODEL_PATH:-/opt/dlami/nvme/models/Qwen-Qwen3.6-35B-A3B-FP8}"
TP_SIZE="${SGLANG_TP_SIZE:-4}"
CONTEXT_LENGTH="${SGLANG_CONTEXT_LENGTH:-131072}"
MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_STATIC:-0.88}"
ATTENTION_BACKEND="${SGLANG_ATTENTION_BACKEND:-flashinfer}"
DIST_INIT_ADDR="${SGLANG_DIST_INIT_ADDR:-127.0.0.1:50000}"

# Host resolution order: explicit SGLANG_HOST, EC2 instance metadata (IMDSv2),
# then the first address reported by `hostname -I`. The metadata calls are
# best-effort and time-bounded so the script cannot hang when the endpoint is
# unreachable.
if [[ -z "${HOST}" ]]; then
  IMDS_TOKEN="$(curl -fsS --connect-timeout 2 --max-time 5 -X PUT http://169.254.169.254/latest/api/token -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600' || true)"
  if [[ -n "${IMDS_TOKEN}" ]]; then
    HOST="$(curl -fsS --connect-timeout 2 --max-time 5 -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" http://169.254.169.254/latest/meta-data/local-ipv4 || true)"
  fi
fi

if [[ -z "${HOST}" ]]; then
  HOST="$(hostname -I | awk '{print $1}')"
fi

if [[ -z "${HOST}" ]]; then
  echo "Unable to resolve GPU private IP for SGLang host binding" >&2
  exit 1
fi

sudo mkdir -p "${NVME_ROOT}"/{cache,logs,state} "${RUNTIME_ROOT}"
# RUNTIME_ROOT was just created through sudo, so the venv and its packages
# must be created through sudo as well or a non-root caller hits EACCES.
sudo python3 -m venv "${VENV_PATH}"
sudo "${VENV_PATH}/bin/pip" install --upgrade pip wheel setuptools
sudo "${VENV_PATH}/bin/pip" install "sglang[all]>=0.5.3" flashinfer-python huggingface_hub

# This file is both a systemd EnvironmentFile and `source`d by the launcher.
sudo tee /etc/default/desineuron-sglang >/dev/null <<EOF
SGLANG_HOST=${HOST}
SGLANG_PORT=${PORT}
SGLANG_MODEL_ID=${MODEL_ID}
SGLANG_MODEL_PATH=${MODEL_PATH}
SGLANG_TP_SIZE=${TP_SIZE}
SGLANG_CONTEXT_LENGTH=${CONTEXT_LENGTH}
SGLANG_MEM_FRACTION_STATIC=${MEM_FRACTION_STATIC}
SGLANG_ATTENTION_BACKEND=${ATTENTION_BACKEND}
SGLANG_DIST_INIT_ADDR=${DIST_INIT_ADDR}
SGLANG_CACHE_DIR=${NVME_ROOT}/cache
SGLANG_LOG_DIR=${NVME_ROOT}/logs
SGLANG_STATE_DIR=${NVME_ROOT}/state
SGLANG_VENV_PATH=${VENV_PATH}
SGLANG_USE_FLASHINFER=1
SGLANG_ENABLE_PREFIX_CACHE=1
SGLANG_SERVED_MODEL_NAME=${MODEL_ID}
SGLANG_EXTRA_ARGS=
EOF
sudo chmod 600 /etc/default/desineuron-sglang

# Quoted heredoc: nothing is expanded at install time; the launcher reads all
# values from /etc/default/desineuron-sglang when it runs.
sudo tee /usr/local/bin/desineuron-sglang-launch.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
source /etc/default/desineuron-sglang
export HF_HOME="${SGLANG_CACHE_DIR}/hf"
export HUGGINGFACE_HUB_CACHE="${SGLANG_CACHE_DIR}/hf"
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SGLANG_USE_FLASHINFER="${SGLANG_USE_FLASHINFER}"
# SGLANG_VENV_PATH follows a RUNTIME_ROOT override; the fallback keeps settings
# files written before that key existed working.
# SGLANG_EXTRA_ARGS is intentionally unquoted so it splits into separate flags.
exec "${SGLANG_VENV_PATH:-/opt/desineuron-sglang/.venv}/bin/sglang" serve \
  --host "${SGLANG_HOST}" \
  --port "${SGLANG_PORT}" \
  --model-path "${SGLANG_MODEL_PATH}" \
  --served-model-name "${SGLANG_SERVED_MODEL_NAME}" \
  --tp-size "${SGLANG_TP_SIZE}" \
  --context-length "${SGLANG_CONTEXT_LENGTH}" \
  --mem-fraction-static "${SGLANG_MEM_FRACTION_STATIC}" \
  --attention-backend "${SGLANG_ATTENTION_BACKEND}" \
  --dist-init-addr "${SGLANG_DIST_INIT_ADDR}" \
  --enable-metrics \
  --skip-server-warmup \
  ${SGLANG_EXTRA_ARGS}
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-launch.sh

sudo tee /etc/systemd/system/desineuron-sglang.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
EnvironmentFile=/etc/default/desineuron-sglang
WorkingDirectory=${RUNTIME_ROOT}
ExecStart=/usr/local/bin/desineuron-sglang-launch.sh
Restart=always
RestartSec=5
LimitNOFILE=1048576

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang.service
sudo systemctl --no-pager --full status desineuron-sglang.service
|
||||
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env bash
# Install the SGLang watchdog script, the model hydrate helper, and the
# systemd service + timer that run the watchdog.
set -euo pipefail

sudo tee /usr/local/bin/desineuron-sglang-watchdog.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
source /etc/default/desineuron-sglang

HEALTH_URL="http://127.0.0.1:${SGLANG_PORT}/v1/models"
HYDRATE_HELPER="/usr/local/bin/desineuron-sglang-hydrate.sh"
STARTUP_GRACE_SECONDS="${SGLANG_STARTUP_GRACE_SECONDS:-900}"
HEALTH_TIMEOUT_SECONDS="${SGLANG_HEALTH_TIMEOUT_SECONDS:-60}"

# Download the model when its directory is missing. SGLANG_MODEL_REPO, when
# set, names the Hugging Face repo to pull; otherwise SGLANG_MODEL_ID is used
# as before.
if [[ ! -d "${SGLANG_MODEL_PATH}" ]]; then
  "${HYDRATE_HELPER}" "${SGLANG_MODEL_REPO:-${SGLANG_MODEL_ID}}" "${SGLANG_MODEL_PATH}"
fi

if ! systemctl is-active --quiet desineuron-sglang.service; then
  systemctl restart desineuron-sglang.service
  sleep 10
fi

# Skip the health probe while the main process is younger than the grace
# period; its age is taken from the mtime of /proc/<pid>.
main_pid="$(systemctl show -p MainPID --value desineuron-sglang.service || true)"
if [[ -n "${main_pid}" && "${main_pid}" != "0" ]]; then
  runtime_age="$(( $(date +%s) - $(stat -c %Y "/proc/${main_pid}" 2>/dev/null || date +%s) ))"
  if (( runtime_age < STARTUP_GRACE_SECONDS )); then
    echo "startup_grace"
    exit 0
  fi
fi

if ! curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null; then
  systemctl restart desineuron-sglang.service
  sleep 20
fi

# A failing probe here exits non-zero through `set -e`.
curl --max-time "${HEALTH_TIMEOUT_SECONDS}" -fsS "${HEALTH_URL}" >/dev/null
echo "healthy"
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-watchdog.sh

sudo tee /usr/local/bin/desineuron-sglang-hydrate.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
# Usage: desineuron-sglang-hydrate.sh <hf-repo-id> <target-path>
set -euo pipefail
MODEL_ID="${1:?model id required}"
TARGET_PATH="${2:?target path required}"

# Prefer the runtime venv when it exists so `hf` and the huggingface_hub
# module installed there are found even when absent from the system PATH.
VENV_BIN="${SGLANG_VENV_PATH:-/opt/desineuron-sglang/.venv}/bin"
if [[ -d "${VENV_BIN}" ]]; then
  PATH="${VENV_BIN}:${PATH}"
fi

mkdir -p "$(dirname "${TARGET_PATH}")"
if command -v hf >/dev/null 2>&1; then
  hf download "${MODEL_ID}" --local-dir "${TARGET_PATH}" --max-workers 8
else
  # Arguments travel via argv and the heredoc is quoted, so the values are
  # never parsed as Python source.
  python3 - "${MODEL_ID}" "${TARGET_PATH}" <<'PY'
import sys

from huggingface_hub import snapshot_download

snapshot_download(repo_id=sys.argv[1], local_dir=sys.argv[2], max_workers=8)
PY
fi
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-hydrate.sh

sudo tee /etc/systemd/system/desineuron-sglang-watchdog.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime Watchdog
After=network-online.target
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/desineuron-sglang-watchdog.sh
EOF

sudo tee /etc/systemd/system/desineuron-sglang-watchdog.timer >/dev/null <<EOF
[Unit]
Description=Run the Desineuron SGLang watchdog every 5 minutes

[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Unit=desineuron-sglang-watchdog.service

[Install]
WantedBy=timers.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang-watchdog.timer
sudo systemctl start desineuron-sglang-watchdog.service
sudo systemctl --no-pager --full status desineuron-sglang-watchdog.timer
|
||||
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env bash
# Install the LLM route sync: venv with boto3, sync script + wrapper, systemd
# service/timer, and the environment file the service reads.
# Expects the payload files under /tmp/desineuron_ingress/.
set -euo pipefail

APP_ROOT=/opt/desineuron-llm-route-sync
VENV_PATH="$APP_ROOT/.venv"
ENV_FILE=/etc/desineuron-llm-route-sync.env
SCRIPT_PATH=/usr/local/bin/sync_llm_route.py
WRAPPER_PATH=/usr/local/bin/run_llm_route_sync.sh
SERVICE_FILE=/etc/systemd/system/desineuron-llm-route-sync.service
TIMER_FILE=/etc/systemd/system/desineuron-llm-route-sync.timer

sudo mkdir -p "$APP_ROOT" /var/lib/desineuron-llm-route-sync
# APP_ROOT is created through sudo, so the venv must be built through sudo too;
# otherwise a non-root caller cannot write into it.
sudo python3 -m venv "$VENV_PATH"
sudo "$VENV_PATH/bin/pip" install --upgrade pip boto3

sudo install -m 0755 /tmp/desineuron_ingress/sync_llm_route.py "$SCRIPT_PATH"
sudo install -m 0755 /tmp/desineuron_ingress/run_llm_route_sync.sh "$WRAPPER_PATH"
sudo install -m 0644 /tmp/desineuron_ingress/desineuron-llm-route-sync.service "$SERVICE_FILE"
sudo install -m 0644 /tmp/desineuron_ingress/desineuron-llm-route-sync.timer "$TIMER_FILE"

sudo tee "$ENV_FILE" >/dev/null <<EOF
OPS_ENV_FILE=/opt/desineuron-ops-control-plane/.env
LLM_ROUTE_HOSTNAME=llm.desineuron.in
LLM_ROUTE_PORT=30100
LLM_INSTANCE_TAG_KEY=DesineuronRole
LLM_INSTANCE_TAG_VALUE=comfyui
LLM_ROUTE_STATE_FILE=/var/lib/desineuron-llm-route-sync/current_target.txt
INGRESS_SSH_KEY_PATH=/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem
EOF

sudo chmod 600 "$ENV_FILE"
sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-llm-route-sync.timer

# A failed first sync should not abort the install; the timer stays enabled.
if ! sudo systemctl start desineuron-llm-route-sync.service; then
  echo "Initial LLM route sync failed; the timer remains enabled and will retry." >&2
fi

# `systemctl status` exits non-zero for a unit that is not active, which is the
# normal resting state of a finished oneshot service. Treat it as informational.
sudo systemctl --no-pager --full status desineuron-llm-route-sync.service desineuron-llm-route-sync.timer || true
|
||||
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
STATE_FILE = Path("/etc/caddy/managed/desineuron-routes.json")
|
||||
SNIPPET_FILE = Path("/etc/caddy/managed/desineuron-routes.caddy")
|
||||
|
||||
|
||||
def load_routes() -> dict[str, dict]:
    """Return the routes stored in STATE_FILE, or an empty mapping if it is absent."""
    if not STATE_FILE.exists():
        return {}
    raw_state = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw_state)
|
||||
|
||||
|
||||
def save_routes(routes: dict[str, dict]) -> None:
    """Write ``routes`` to STATE_FILE as indented JSON, creating its directory if needed."""
    state_dir = STATE_FILE.parent
    state_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(routes, indent=2)
    STATE_FILE.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def render_routes(routes: dict[str, dict]) -> None:
    """Render ``routes`` into the managed Caddy snippet and the LLM upstream include.

    One site block per hostname (sorted) is written to SNIPPET_FILE. The include
    file /etc/caddy/managed/llm_upstream.caddy_inc receives a ``handle_path /llm/*``
    proxy when ``llm.desineuron.in`` is among the routes and is emptied otherwise.
    """
    rendered: list[str] = []
    for hostname in sorted(routes):
        route = routes[hostname]
        upstream = f"{route['scheme']}://{route['target_host']}:{route['target_port']}"
        rendered += [
            f"{hostname} {{",
            "\ttls /etc/caddy/tls/fullchain.pem /etc/caddy/tls/privkey.pem",
            "\tlog {",
            "\t\toutput file /var/log/caddy/access.log",
            "\t\tformat json",
            "\t}",
            f"\treverse_proxy {upstream} {{",
            "\t\theader_up Host {host}",
            "\t\theader_up X-Forwarded-Host {host}",
            "\t\theader_up X-Forwarded-Proto {scheme}",
            "\t\theader_up X-Forwarded-For {remote_host}",
            "\t}",
            "}",
            "",
        ]
    snippet_text = "\n".join(rendered).rstrip() + "\n"
    SNIPPET_FILE.write_text(snippet_text, encoding="utf-8")

    # Generate a dedicated upstream include exclusively for velocity.desineuron.in/llm
    llm_inc = Path("/etc/caddy/managed/llm_upstream.caddy_inc")
    if "llm.desineuron.in" not in routes:
        llm_inc.write_text("", encoding="utf-8")
        return

    llm_route = routes["llm.desineuron.in"]
    llm_upstream = f"{llm_route['scheme']}://{llm_route['target_host']}:{llm_route['target_port']}"
    include_lines = [
        "handle_path /llm/* {",
        f"\treverse_proxy {llm_upstream} {{",
        "\t\theader_up Host {host}",
        "\t\theader_up X-Forwarded-For {remote_host}",
        "\t\tflush_interval -1",
        "\t\theader_down X-Accel-Buffering no",
        "\t}",
        "}",
    ]
    llm_inc.write_text("\n".join(include_lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: ``upsert <json-payload>``, ``delete <hostname>`` or ``list``.

    Returns 0 on success and 1 on a usage error, an invalid payload, or an
    unknown command. ``upsert`` and ``delete`` persist the route table and then
    re-render the Caddy snippets.
    """
    usage = "usage: manage_desineuron_routes.py <upsert|delete|list> [payload|hostname]"
    if len(sys.argv) < 2:
        print(usage)
        return 1
    command = sys.argv[1]
    # upsert/delete need a second argument; without this check sys.argv[2]
    # raised IndexError.
    if command in ("upsert", "delete") and len(sys.argv) < 3:
        print(usage, file=sys.stderr)
        return 1

    routes = load_routes()
    if command == "upsert":
        try:
            payload = json.loads(sys.argv[2])
        except json.JSONDecodeError as exc:
            print(f"invalid JSON payload: {exc}", file=sys.stderr)
            return 1
        # Validate before saving: render_routes indexes these keys, so a route
        # saved without them would make every later render fail.
        required = ("hostname", "scheme", "target_host", "target_port")
        if not isinstance(payload, dict):
            print("payload must be a JSON object", file=sys.stderr)
            return 1
        missing = [key for key in required if key not in payload]
        if missing:
            print(f"payload missing keys: {', '.join(missing)}", file=sys.stderr)
            return 1
        routes[payload["hostname"]] = payload
        save_routes(routes)
        render_routes(routes)
        print(json.dumps({"status": "ok", "action": "upsert", "hostname": payload["hostname"]}))
        return 0
    if command == "delete":
        hostname = sys.argv[2]
        routes.pop(hostname, None)
        save_routes(routes)
        render_routes(routes)
        print(json.dumps({"status": "ok", "action": "delete", "hostname": hostname}))
        return 0
    if command == "list":
        print(json.dumps(routes, indent=2))
        return 0
    print(f"unknown command: {command}")
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,34 @@
|
||||
# Restrict Ollama (TCP 11434) on the GPU security groups: allow the ingress
# security group as a source on the first GPU group, revoke the 0.0.0.0/0 rule
# on every GPU group, then print the resulting ingress rules.
$ErrorActionPreference = "Stop"

$gpuGroups = @(
    "sg-0b144c17b1b89f4c6",
    "sg-05e4de3fe94ad6558"
)

$ingressGroup = "sg-0721b8b48e12c531d"

# Both calls are best-effort (the rule may already exist or already be gone),
# but failures are reported instead of silently discarded. A failing native
# command does not necessarily raise into `catch`, so $LASTEXITCODE is checked
# as well.
try {
    aws ec2 authorize-security-group-ingress `
        --group-id "sg-0b144c17b1b89f4c6" `
        --protocol tcp --port 11434 `
        --source-group $ingressGroup | Out-Null
    if ($LASTEXITCODE -ne 0) {
        Write-Warning "authorize-security-group-ingress exited with code $LASTEXITCODE (the rule may already exist)"
    }
} catch {
    Write-Warning "authorize-security-group-ingress failed: $_"
}

foreach ($group in $gpuGroups) {
    foreach ($port in 11434) {
        try {
            aws ec2 revoke-security-group-ingress `
                --group-id $group `
                --protocol tcp `
                --port $port `
                --cidr 0.0.0.0/0 | Out-Null
            if ($LASTEXITCODE -ne 0) {
                Write-Warning "revoke-security-group-ingress on $group port $port exited with code $LASTEXITCODE (the rule may not exist)"
            }
        } catch {
            Write-Warning "revoke-security-group-ingress on $group port $port failed: $_"
        }
    }
}

aws ec2 describe-security-groups `
    --group-ids $gpuGroups `
    --query "SecurityGroups[].{GroupId:GroupId,GroupName:GroupName,Ingress:IpPermissions}" `
    --output json
|
||||
13
infrastructure/desineuron_ingress/run_llm_route_sync.sh
Normal file
13
infrastructure/desineuron_ingress/run_llm_route_sync.sh
Normal file
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
APP_ROOT=/opt/desineuron-llm-route-sync
|
||||
SCRIPT_PATH=/usr/local/bin/sync_llm_route.py
|
||||
VENV_PYTHON="$APP_ROOT/.venv/bin/python"
|
||||
|
||||
if [[ ! -x "$VENV_PYTHON" ]]; then
|
||||
echo "Missing route-sync venv python at $VENV_PYTHON" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exec "$VENV_PYTHON" "$SCRIPT_PATH"
|
||||
42
infrastructure/desineuron_ingress/start_gpu.py
Normal file
42
infrastructure/desineuron_ingress/start_gpu.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import boto3, os, time
|
||||
from pathlib import Path
|
||||
# Parse KEY=VALUE lines from the ops control-plane .env file into `d`.
d = {}
for l in Path('/opt/desineuron-ops-control-plane/.env').read_text().splitlines():
    l = l.strip()
    if '=' in l and not l.startswith('#'):
        k, v = l.split('=', 1)
        d[k.strip()] = v.strip()

# Export credentials only when the .env file actually provides them. Assigning
# '' unconditionally would overwrite credentials already present in the
# process environment.
for _cred_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    if d.get(_cred_name):
        os.environ[_cred_name] = d[_cred_name]

ec2 = boto3.client('ec2', region_name='us-east-1')
|
||||
|
||||
def get_gpu():
    """Return the first instance tagged Name=desineuron-comfy-gpu, or None.

    Filtering happens server-side and results are paginated. Instances that are
    shutting down or terminated are excluded so a dead instance that kept its
    Name tag is never returned in place of the live one.
    """
    paginator = ec2.get_paginator('describe_instances')
    pages = paginator.paginate(
        Filters=[
            {'Name': 'tag:Name', 'Values': ['desineuron-comfy-gpu']},
            {'Name': 'instance-state-name', 'Values': ['pending', 'running', 'stopping', 'stopped']},
        ]
    )
    for page in pages:
        for r in page['Reservations']:
            for i in r['Instances']:
                return i
    return None
|
||||
|
||||
def main():
    """Poll the GPU instance until it is running, starting it when it is stopped.

    The loop ends when the instance is running, when no instance is found, or
    when the instance is in a state it cannot leave (shutting-down/terminated);
    previously the last case kept polling forever.
    """
    while True:
        i = get_gpu()
        if not i:
            print('Not found')
            break
        state = i['State']['Name']
        print(f"Instance {i['InstanceId']} is {state}")
        if state == 'stopped':
            print('Starting instance...')
            ec2.start_instances(InstanceIds=[i['InstanceId']])
            time.sleep(5)
        elif state == 'stopping':
            print('Waiting for extremely aggressive stop sequence gracefully...')
            time.sleep(10)
        elif state == 'running':
            print('Instance successfully running payload on IP:', i.get('PrivateIpAddress'))
            break
        elif state in ('shutting-down', 'terminated'):
            print('Instance is terminated or terminating; giving up')
            break
        else:
            print('Waiting eagerly...')
            time.sleep(10)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
152
infrastructure/desineuron_ingress/sync_llm_route.py
Normal file
152
infrastructure/desineuron_ingress/sync_llm_route.py
Normal file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import boto3
|
||||
|
||||
|
||||
def load_env_file(path: Path) -> dict[str, str]:
    """Parse a dotenv-style file into a dict; a missing file yields ``{}``.

    Blank lines, ``#`` comments and lines without ``=`` are ignored. A leading
    ``export `` on the key and one pair of matching single or double quotes
    around the value are removed, so ``export KEY="value"`` yields
    ``{"KEY": "value"}``.
    """
    data: dict[str, str] = {}
    if not path.exists():
        return data
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()
        value = value.strip()
        # Drop one layer of matching quotes; unbalanced quotes are left alone.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        data[key] = value
    return data
|
||||
|
||||
|
||||
def env(name: str, default: str = "") -> str:
    """Look up ``name`` in the process environment, returning ``default`` when unset."""
    return os.getenv(name, default)
|
||||
|
||||
|
||||
def resolve_target_instance(ec2) -> dict | None:
    """Pick the EC2 instance the LLM route should point at, or None.

    When LLM_INSTANCE_ID is set, only that instance is considered and it is
    returned only if it is running. Otherwise the most recently launched running
    instance carrying the LLM_INSTANCE_TAG_KEY / LLM_INSTANCE_TAG_VALUE tag wins.
    """
    pinned_id = env("LLM_INSTANCE_ID")
    if pinned_id:
        response = ec2.describe_instances(InstanceIds=[pinned_id])
        pinned = (
            candidate
            for reservation in response["Reservations"]
            for candidate in reservation["Instances"]
        )
        return next((c for c in pinned if c["State"]["Name"] == "running"), None)

    # We assume the LLM runtime runs on the same GPU instance as comfyui initially
    tag_key = env("LLM_INSTANCE_TAG_KEY", "DesineuronRole")
    tag_value = env("LLM_INSTANCE_TAG_VALUE", "comfyui")
    response = ec2.describe_instances(
        Filters=[
            {"Name": "instance-state-name", "Values": ["running"]},
            {"Name": f"tag:{tag_key}", "Values": [tag_value]},
        ]
    )
    running: list = []
    for reservation in response["Reservations"]:
        running.extend(reservation["Instances"])
    if not running:
        return None
    # Newest launch first; max() keeps the earliest-listed instance on ties.
    return max(running, key=lambda row: row["LaunchTime"])
|
||||
|
||||
|
||||
def upsert_route(hostname: str, private_ip: str, port: int) -> subprocess.CompletedProcess[str]:
    """Upsert the route on the ingress host over SSH, then validate and reload Caddy.

    Connection settings come from the INGRESS_SSH_* and INGRESS_ROUTE_HELPER
    environment variables. The returned CompletedProcess carries the remote
    command's exit status and captured output; a non-zero ``returncode`` with an
    explanatory ``stderr`` is returned without running ssh when the host or key
    path is not configured.
    """
    import shlex  # local import: only this function needs it

    ingress_host = env("INGRESS_SSH_HOST")
    ingress_user = env("INGRESS_SSH_USER", "ec2-user")
    ingress_port = env("INGRESS_SSH_PORT", "22")
    ingress_key = env("INGRESS_SSH_KEY_PATH")
    helper = env("INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py")

    if not ingress_host or not ingress_key:
        return subprocess.CompletedProcess(
            args=["ssh"],
            returncode=2,
            stdout="",
            stderr="INGRESS_SSH_HOST and INGRESS_SSH_KEY_PATH must be configured\n",
        )

    payload = json.dumps(
        {
            "hostname": hostname,
            "scheme": "http",
            "target_host": private_ip,
            "target_port": port,
        }
    )
    # shlex.quote keeps the remote shell from splitting or interpreting the
    # payload, even if a value contains a single quote.
    command = (
        f"sudo {shlex.quote(helper)} upsert {shlex.quote(payload)}"
        " && sudo caddy validate --config /etc/caddy/Caddyfile"
        " && sudo systemctl reload caddy"
    )
    # NOTE(review): host key verification is disabled here, as before.
    return subprocess.run(
        [
            "ssh",
            "-o",
            "StrictHostKeyChecking=no",
            "-o",
            "UserKnownHostsFile=/dev/null",
            # Fail fast instead of waiting on a prompt or an unreachable host.
            "-o",
            "BatchMode=yes",
            "-o",
            "ConnectTimeout=15",
            "-i",
            ingress_key,
            "-p",
            ingress_port,
            f"{ingress_user}@{ingress_host}",
            command,
        ],
        capture_output=True,
        text=True,
        check=False,
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Point the LLM route at the current target instance if it has moved.

    Fills missing AWS and ingress settings from the ops env file, resolves the
    target instance's private IP, and compares ``ip:port`` with the state file.
    An unchanged target prints a ``noop`` record; otherwise the route is upserted
    and the state file rewritten. Returns 0 on success, 1 when no usable target
    exists, or the upsert command's exit code on failure.
    """
    ops_env_path = Path(env("OPS_ENV_FILE", "/opt/desineuron-ops-control-plane/.env"))
    ops_env = load_env_file(ops_env_path)

    # Values already in the process environment win over the ops env file.
    for aws_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"):
        if aws_name in ops_env and aws_name not in os.environ:
            os.environ[aws_name] = ops_env[aws_name]

    # Key paths recorded under /app/state/ are remapped to the host state directory.
    key_path = ops_env.get("OPS_SSH_KEY_PATH", "/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem")
    if key_path.startswith("/app/state/"):
        key_path = key_path.replace("/app/state/", "/opt/desineuron-ops-control-plane/state/")

    fallbacks = {
        "AWS_DEFAULT_REGION": ops_env.get("OPS_DEFAULT_REGION", "us-east-1"),
        "INGRESS_SSH_HOST": ops_env.get("OPS_INGRESS_SSH_HOST", ""),
        "INGRESS_SSH_USER": ops_env.get("OPS_INGRESS_SSH_USER", "ec2-user"),
        "INGRESS_SSH_PORT": ops_env.get("OPS_INGRESS_SSH_PORT", "22"),
        "INGRESS_SSH_KEY_PATH": key_path,
        "INGRESS_ROUTE_HELPER": ops_env.get("OPS_INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py"),
    }
    for setting, fallback in fallbacks.items():
        os.environ.setdefault(setting, fallback)

    region = os.environ["AWS_DEFAULT_REGION"]
    hostname = env("LLM_ROUTE_HOSTNAME", "llm.desineuron.in")
    port = int(env("LLM_ROUTE_PORT", "11434"))
    state_file = Path(env("LLM_ROUTE_STATE_FILE", "/var/lib/desineuron-llm-route-sync/current_target.txt"))

    ec2 = boto3.client("ec2", region_name=region)
    instance = resolve_target_instance(ec2)
    if not instance:
        print("No running LLM target instance found", file=sys.stderr)
        return 1

    private_ip = instance.get("PrivateIpAddress")
    if not private_ip:
        print("Target instance has no private IP", file=sys.stderr)
        return 1

    desired_state = f"{private_ip}:{port}"
    recorded_state = ""
    if state_file.exists():
        recorded_state = state_file.read_text(encoding="utf-8").strip()

    route_summary = {"hostname": hostname, "target_host": private_ip, "target_port": port}
    if recorded_state == desired_state:
        print(json.dumps({"status": "noop", **route_summary}))
        return 0

    result = upsert_route(hostname, private_ip, port)
    if result.returncode != 0:
        print(result.stdout)
        print(result.stderr, file=sys.stderr)
        return result.returncode

    # Record the new target only after the ingress accepted it.
    state_file.parent.mkdir(parents=True, exist_ok=True)
    state_file.write_text(desired_state, encoding="utf-8")
    print(json.dumps({"status": "updated", **route_summary}))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
21
infrastructure/desineuron_ingress/update_ingress_tls.sh
Normal file
21
infrastructure/desineuron_ingress/update_ingress_tls.sh
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
# Push the Caddyfile to the ingress proxy, renew the wildcard certificate via
# the Cloudflare DNS challenge, and install the fresh certificate on the proxy.
#
# Environment:
#   CLOUDFLARE_API_TOKEN  Cloudflare DNS API token. Required unless
#                         /etc/letsencrypt/.secrets/cloudflare.ini already
#                         exists and is non-empty.
#   INGRESS_HOST, INGRESS_USER, INGRESS_KEY  optional overrides.
#
# SECURITY: an API token used to be hard-coded in this script and is therefore
# in version control history. Revoke that token and issue a new one.
set -euo pipefail
set -x

INGRESS_HOST="${INGRESS_HOST:-98.87.120.120}"
INGRESS_USER="${INGRESS_USER:-ec2-user}"
INGRESS_KEY="${INGRESS_KEY:-/opt/desineuron-ops-control-plane/state/desineuron-l4-node.pem}"
REMOTE="${INGRESS_USER}@${INGRESS_HOST}"
SSH_OPTS=(-o StrictHostKeyChecking=no -i "${INGRESS_KEY}")
CLOUDFLARE_INI="/etc/letsencrypt/.secrets/cloudflare.ini"

# Push the Caddyfile configuration
sudo scp "${SSH_OPTS[@]}" /tmp/Caddyfile "${REMOTE}:/tmp/Caddyfile"
sudo ssh "${SSH_OPTS[@]}" "${REMOTE}" 'sudo cp /tmp/Caddyfile /etc/caddy/Caddyfile'

# Provision the Cloudflare credentials. Tracing is switched off for this
# section so the token never appears in the xtrace output.
{ set +x; } 2>/dev/null
sudo mkdir -p /etc/letsencrypt/.secrets/
if [[ -n "${CLOUDFLARE_API_TOKEN:-}" ]]; then
  # Create the file with mode 600 before any secret is written into it.
  sudo install -m 600 /dev/null "${CLOUDFLARE_INI}"
  printf 'dns_cloudflare_api_token = %s\n' "${CLOUDFLARE_API_TOKEN}" \
    | sudo tee "${CLOUDFLARE_INI}" >/dev/null
elif ! sudo test -s "${CLOUDFLARE_INI}"; then
  echo "CLOUDFLARE_API_TOKEN is not set and ${CLOUDFLARE_INI} does not exist" >&2
  exit 1
fi
sudo chmod 600 "${CLOUDFLARE_INI}"
set -x

# Renew and expand Let's Encrypt certificates locally on velocity-linux utilizing cloudflare dns
sudo certbot certonly --cert-name desineuron-infra --dns-cloudflare --dns-cloudflare-credentials "${CLOUDFLARE_INI}" -d '*.desineuron.in' -d desineuron.in --expand --non-interactive --agree-tos

# Copy the fresh certs directly to the proxy substrate
sudo scp "${SSH_OPTS[@]}" /etc/letsencrypt/live/desineuron-infra/fullchain.pem "${REMOTE}:/tmp/fullchain.pem"
sudo scp "${SSH_OPTS[@]}" /etc/letsencrypt/live/desineuron-infra/privkey.pem "${REMOTE}:/tmp/privkey.pem"

# Apply to Caddy: install the files, validate the config, reload, and always
# remove the temporary copies (including the private key) from /tmp.
sudo ssh "${SSH_OPTS[@]}" "${REMOTE}" 'sudo cp /tmp/fullchain.pem /etc/caddy/tls/fullchain.pem && sudo cp /tmp/privkey.pem /etc/caddy/tls/privkey.pem && sudo caddy validate --config /etc/caddy/Caddyfile && sudo systemctl reload caddy; rc=$?; rm -f /tmp/fullchain.pem /tmp/privkey.pem; exit "$rc"'
|
||||
@@ -11,6 +11,17 @@ server {
|
||||
access_log /var/log/nginx/velocity.desineuron.in.access.log;
|
||||
error_log /var/log/nginx/velocity.desineuron.in.error.log;
|
||||
|
||||
# API requests are proxied to the local backend on port 8001. The Upgrade and
# Connection headers are forwarded so upgraded connections can pass through.
location /api/ {
    proxy_pass http://127.0.0.1:8001;
    proxy_http_version 1.1;
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}

# Everything else is served from disk, falling back to /index.html.
location / {
    try_files $uri $uri/ /index.html;
}
|
||||
|
||||
Reference in New Issue
Block a user