#!/usr/bin/env bash
#
# Provision the desineuron SGLang runtime on this host:
#   1. resolve the address the server should bind to,
#   2. create the working directories and the Python virtualenv,
#   3. install SGLang and its companions into the virtualenv,
#   4. write the environment file and the launcher script,
#   5. install the systemd unit.
#
# Optional environment overrides (defaults are set below):
#   NVME_ROOT, RUNTIME_ROOT, SGLANG_PORT, SGLANG_HOST, SGLANG_MODEL_ID,
#   SGLANG_MODEL_PATH, SGLANG_TP_SIZE, SGLANG_CONTEXT_LENGTH,
#   SGLANG_MEM_FRACTION_STATIC, SGLANG_ATTENTION_BACKEND,
#   SGLANG_DIST_INIT_ADDR, SGLANG_USE_FLASHINFER, SGLANG_EXTRA_ARGS
set -euo pipefail

NVME_ROOT="${NVME_ROOT:-/opt/dlami/nvme/sglang}"
RUNTIME_ROOT="${RUNTIME_ROOT:-/opt/desineuron-sglang}"
VENV_PATH="${RUNTIME_ROOT}/.venv"
PORT="${SGLANG_PORT:-30100}"
HOST="${SGLANG_HOST:-}"
MODEL_ID="${SGLANG_MODEL_ID:-qwen3.6-35b-a3b}"
MODEL_PATH="${SGLANG_MODEL_PATH:-/opt/dlami/nvme/models/Qwen-Qwen3.6-35B-A3B-FP8}"
TP_SIZE="${SGLANG_TP_SIZE:-4}"
CONTEXT_LENGTH="${SGLANG_CONTEXT_LENGTH:-131072}"
MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_STATIC:-0.88}"
ATTENTION_BACKEND="${SGLANG_ATTENTION_BACKEND:-flashinfer}"
DIST_INIT_ADDR="${SGLANG_DIST_INIT_ADDR:-127.0.0.1:50000}"
# NOTE(review): the launcher requires SGLANG_USE_FLASHINFER to be set, but the
# intended value is not visible in this script; "1" is an assumption - confirm.
USE_FLASHINFER="${SGLANG_USE_FLASHINFER:-1}"
# Extra flags appended verbatim to `sglang serve`; empty means none.
EXTRA_ARGS="${SGLANG_EXTRA_ARGS:-}"

# Host resolution, step 1: ask the EC2 instance metadata service (token-based
# flow) for the private IPv4. Failures are tolerated on purpose (|| true) so
# the script can fall through to the next strategy; the timeouts keep a host
# without a metadata endpoint from stalling the run.
if [[ -z "${HOST}" ]]; then
  IMDS_TOKEN="$(curl -fsS --connect-timeout 2 --max-time 5 -X PUT \
    http://169.254.169.254/latest/api/token \
    -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600' || true)"
  if [[ -n "${IMDS_TOKEN}" ]]; then
    HOST="$(curl -fsS --connect-timeout 2 --max-time 5 \
      -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" \
      http://169.254.169.254/latest/meta-data/local-ipv4 || true)"
  fi
fi

# Host resolution, step 2: first address reported by `hostname -I`. The
# `|| true` keeps pipefail from aborting here so the explicit error below is
# what the operator sees.
if [[ -z "${HOST}" ]]; then
  HOST="$(hostname -I | awk '{print $1}' || true)"
fi

if [[ -z "${HOST}" ]]; then
  echo "Unable to resolve GPU private IP for SGLang host binding" >&2
  exit 1
fi

sudo mkdir -p "${NVME_ROOT}"/{cache,logs,state} "${RUNTIME_ROOT}"

python3 -m venv "${VENV_PATH}"
"${VENV_PATH}/bin/pip" install --upgrade pip wheel setuptools
"${VENV_PATH}/bin/pip" install "sglang[all]>=0.5.3" flashinfer-python huggingface_hub

# Environment file sourced by the launcher below. The heredoc delimiter is
# unquoted so the values resolved above are baked in; values are double-quoted
# so the file stays valid when sourced by bash.
# NOTE(review): SGLANG_CACHE_DIR is assumed to be the cache directory created
# above - confirm.
sudo tee /etc/default/desineuron-sglang >/dev/null <<EOF
SGLANG_HOST="${HOST}"
SGLANG_PORT="${PORT}"
SGLANG_MODEL_PATH="${MODEL_PATH}"
SGLANG_SERVED_MODEL_NAME="${MODEL_ID}"
SGLANG_TP_SIZE="${TP_SIZE}"
SGLANG_CONTEXT_LENGTH="${CONTEXT_LENGTH}"
SGLANG_MEM_FRACTION_STATIC="${MEM_FRACTION_STATIC}"
SGLANG_ATTENTION_BACKEND="${ATTENTION_BACKEND}"
SGLANG_DIST_INIT_ADDR="${DIST_INIT_ADDR}"
SGLANG_CACHE_DIR="${NVME_ROOT}/cache"
SGLANG_USE_FLASHINFER="${USE_FLASHINFER}"
SGLANG_EXTRA_ARGS="${EXTRA_ARGS}"
EOF

# Launcher script. The delimiter is quoted so nothing is expanded now; every
# variable is resolved when the launcher runs, from the environment file.
sudo tee /usr/local/bin/desineuron-sglang-launch.sh >/dev/null <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
source /etc/default/desineuron-sglang
export HF_HOME="${SGLANG_CACHE_DIR}/hf"
export HUGGINGFACE_HUB_CACHE="${SGLANG_CACHE_DIR}/hf"
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export SGLANG_USE_FLASHINFER="${SGLANG_USE_FLASHINFER}"
# SGLANG_EXTRA_ARGS is intentionally unquoted so it splits into separate flags.
exec /opt/desineuron-sglang/.venv/bin/sglang serve \
  --host "${SGLANG_HOST}" \
  --port "${SGLANG_PORT}" \
  --model-path "${SGLANG_MODEL_PATH}" \
  --served-model-name "${SGLANG_SERVED_MODEL_NAME}" \
  --tp-size "${SGLANG_TP_SIZE}" \
  --context-length "${SGLANG_CONTEXT_LENGTH}" \
  --mem-fraction-static "${SGLANG_MEM_FRACTION_STATIC}" \
  --attention-backend "${SGLANG_ATTENTION_BACKEND}" \
  --dist-init-addr "${SGLANG_DIST_INIT_ADDR}" \
  --enable-metrics \
  --skip-server-warmup \
  ${SGLANG_EXTRA_ARGS}
EOF
sudo chmod 0755 /usr/local/bin/desineuron-sglang-launch.sh

# systemd unit installation; this statement continues past the end of the
# section under review and is kept exactly as found.
sudo tee /etc/systemd/system/desineuron-sglang.service >/dev/null <