#!/usr/bin/env bash
#
# Provision the Desineuron SGLang runtime on a GPU host: resolve the bind
# address, build a Python virtualenv, write the environment file, launcher
# script and systemd unit, then enable and start the service.
#
# Every setting below can be overridden from the caller's environment:
#   NVME_ROOT, RUNTIME_ROOT                 install locations
#   SGLANG_HOST, SGLANG_PORT                bind address (host auto-detected
#                                           later in the script when empty)
#   SGLANG_MODEL_ID, SGLANG_MODEL_PATH      served name and weights location
#   SGLANG_TP_SIZE, SGLANG_CONTEXT_LENGTH,
#   SGLANG_MEM_FRACTION_STATIC,
#   SGLANG_ATTENTION_BACKEND,
#   SGLANG_DIST_INIT_ADDR                   values passed through to the server
set -euo pipefail

# Install locations. The venv always lives directly under the runtime root.
NVME_ROOT="${NVME_ROOT:-/opt/dlami/nvme/sglang}"
RUNTIME_ROOT="${RUNTIME_ROOT:-/opt/desineuron-sglang}"
VENV_PATH="${RUNTIME_ROOT}/.venv"

# Network binding. HOST stays empty here unless SGLANG_HOST was supplied.
PORT="${SGLANG_PORT:-30100}"
HOST="${SGLANG_HOST:-}"

# Model selection.
MODEL_ID="${SGLANG_MODEL_ID:-qwen3.6-35b-a3b}"
MODEL_PATH="${SGLANG_MODEL_PATH:-/opt/dlami/nvme/models/Qwen-Qwen3.6-35B-A3B-FP8}"

# Server tuning values, written verbatim into the environment file.
TP_SIZE="${SGLANG_TP_SIZE:-4}"
CONTEXT_LENGTH="${SGLANG_CONTEXT_LENGTH:-131072}"
MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_STATIC:-0.88}"
ATTENTION_BACKEND="${SGLANG_ATTENTION_BACKEND:-flashinfer}"
DIST_INIT_ADDR="${SGLANG_DIST_INIT_ADDR:-127.0.0.1:50000}"

# Resolve the address SGLang binds to when SGLANG_HOST was not supplied.
# Preference order: the private IPv4 from the EC2 instance metadata service
# (IMDSv2: fetch a session token, then query with it), then the first address
# printed by `hostname -I`.
if [[ -z "${HOST}" ]]; then
  # Both lookups are best-effort (`|| true`) so a host without a reachable
  # metadata endpoint falls through to the next strategy. The timeouts keep
  # an unreachable endpoint from stalling the install.
  IMDS_TOKEN="$(
    curl -fsS --connect-timeout 2 --max-time 5 -X PUT \
      -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600' \
      http://169.254.169.254/latest/api/token || true
  )"
  if [[ -n "${IMDS_TOKEN}" ]]; then
    HOST="$(
      curl -fsS --connect-timeout 2 --max-time 5 \
        -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" \
        http://169.254.169.254/latest/meta-data/local-ipv4 || true
    )"
  fi
fi

if [[ -z "${HOST}" ]]; then
  # `|| true` keeps a failing `hostname -I` (with pipefail active) from
  # aborting here, so the explicit error below is what the operator sees.
  HOST="$(hostname -I | awk '{print $1}' || true)"
fi

if [[ -z "${HOST}" ]]; then
  echo "Unable to resolve GPU private IP for SGLang host binding" >&2
  exit 1
fi

# Create the cache/log/state directories under NVME_ROOT and the runtime
# root, then build the virtualenv and install SGLang into it.
sudo mkdir -p "${NVME_ROOT}"/{cache,logs,state} "${RUNTIME_ROOT}"

# The mkdir above runs as root, so a freshly created runtime root is not
# writable by an unprivileged caller and the venv step below would fail.
# Hand the directory (non-recursively) to the invoking user in that case;
# an already-writable directory is left untouched.
if [[ ! -w "${RUNTIME_ROOT}" ]]; then
  sudo chown "$(id -u):$(id -g)" "${RUNTIME_ROOT}"
fi

python3 -m venv "${VENV_PATH}"

# Refresh the packaging toolchain first, then install the runtime packages.
"${VENV_PATH}/bin/pip" install --upgrade pip wheel setuptools
"${VENV_PATH}/bin/pip" install "sglang[all]>=0.5.3" flashinfer-python huggingface_hub

# Write the environment file holding the resolved settings. The heredoc
# delimiter is unquoted on purpose: values are expanded now, at install time,
# so the file contains plain KEY=value lines only.
sudo tee /etc/default/desineuron-sglang >/dev/null <<EOF
SGLANG_HOST=${HOST}
SGLANG_PORT=${PORT}
SGLANG_MODEL_ID=${MODEL_ID}
SGLANG_MODEL_PATH=${MODEL_PATH}
SGLANG_TP_SIZE=${TP_SIZE}
SGLANG_CONTEXT_LENGTH=${CONTEXT_LENGTH}
SGLANG_MEM_FRACTION_STATIC=${MEM_FRACTION_STATIC}
SGLANG_ATTENTION_BACKEND=${ATTENTION_BACKEND}
SGLANG_DIST_INIT_ADDR=${DIST_INIT_ADDR}
SGLANG_CACHE_DIR=${NVME_ROOT}/cache
SGLANG_LOG_DIR=${NVME_ROOT}/logs
SGLANG_STATE_DIR=${NVME_ROOT}/state
SGLANG_USE_FLASHINFER=1
SGLANG_ENABLE_PREFIX_CACHE=1
SGLANG_SERVED_MODEL_NAME=${MODEL_ID}
SGLANG_EXTRA_ARGS=
EOF
# Restrict the file to its owner (read/write only).
sudo chmod 600 /etc/default/desineuron-sglang

# Generate the launcher script that starts the SGLang server.
# The first lines are produced with printf so the venv location resolved by
# this installer (which follows RUNTIME_ROOT) is baked into the launcher; the
# remainder comes from a quoted heredoc, so its ${...} references are expanded
# only when the launcher itself runs.
{
  printf '%s\n' '#!/usr/bin/env bash' 'set -euo pipefail'
  printf 'readonly SGLANG_BIN=%q\n' "${VENV_PATH}/bin/sglang"
  cat <<'EOF'

# Load the settings written by the installer.
source /etc/default/desineuron-sglang

export HF_HOME="${SGLANG_CACHE_DIR}/hf"
export HUGGINGFACE_HUB_CACHE="${SGLANG_CACHE_DIR}/hf"
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SGLANG_USE_FLASHINFER="${SGLANG_USE_FLASHINFER}"

# Split the optional extra arguments on whitespace into an array so they are
# passed as separate words without being subject to pathname expansion.
extra_args=()
if [[ -n "${SGLANG_EXTRA_ARGS:-}" ]]; then
  read -r -a extra_args <<<"${SGLANG_EXTRA_ARGS}"
fi

# The ${arr[@]+...} form expands to nothing for an empty array, which keeps
# `set -u` happy on older bash releases.
exec "${SGLANG_BIN}" serve \
  --host "${SGLANG_HOST}" \
  --port "${SGLANG_PORT}" \
  --model-path "${SGLANG_MODEL_PATH}" \
  --served-model-name "${SGLANG_SERVED_MODEL_NAME}" \
  --tp-size "${SGLANG_TP_SIZE}" \
  --context-length "${SGLANG_CONTEXT_LENGTH}" \
  --mem-fraction-static "${SGLANG_MEM_FRACTION_STATIC}" \
  --attention-backend "${SGLANG_ATTENTION_BACKEND}" \
  --dist-init-addr "${SGLANG_DIST_INIT_ADDR}" \
  --enable-metrics \
  --skip-server-warmup \
  ${extra_args[@]+"${extra_args[@]}"}
EOF
} | sudo tee /usr/local/bin/desineuron-sglang-launch.sh >/dev/null
sudo chmod 0755 /usr/local/bin/desineuron-sglang-launch.sh

# Install the systemd unit. The heredoc delimiter is unquoted so that
# ${RUNTIME_ROOT} is expanded now and the unit carries a literal path.
# The unit loads the environment file and runs the generated launcher.
sudo tee /etc/systemd/system/desineuron-sglang.service >/dev/null <<EOF
[Unit]
Description=Desineuron SGLang Runtime
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
EnvironmentFile=/etc/default/desineuron-sglang
WorkingDirectory=${RUNTIME_ROOT}
ExecStart=/usr/local/bin/desineuron-sglang-launch.sh
Restart=always
RestartSec=5
LimitNOFILE=1048576

[Install]
WantedBy=multi-user.target
EOF

# Make systemd pick up the unit file, then enable the service at boot and
# start it immediately.
sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-sglang.service

# Print the full, unpaged service status. As the final command, its exit
# status becomes the exit status of this script.
sudo systemctl --no-pager --full status desineuron-sglang.service