# ============================================================
# Velocity-OS — NVIDIA MIG Configuration for K3s
# Target GPU: NVIDIA RTX 6000 Blackwell (96GB VRAM)
#
# MIG Strategy: Partition 96GB into two equal 48GB slices:
#   MIG slice 0 (3g.48gb): SGLang LLM inference (core-api)
#   MIG slice 1 (3g.48gb): ComfyUI media generation (media-engine)
#
# Result: Concurrent zero-contention GPU execution.
#         No operator toggle required.
#
# Prerequisites on workstation:
#   - nvidia-driver >= 550
#   - CUDA >= 12.4
#   - k3s with nvidia-container-toolkit
#   - NVIDIA device plugin with MIG support
# ============================================================

# ── Step 1: Enable MIG mode on the GPU ───────────────────────
# Run on workstation (one-time, survives reboot via service):
#   sudo nvidia-smi -i 0 -mig 1
#   sudo reboot

# ── Step 2: Create MIG instances ─────────────────────────────
# Run after reboot:
#   sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
# This creates:
#   GPU instance 0: 3g.48gb (48GB) → MIG device 0
#   GPU instance 1: 3g.48gb (48GB) → MIG device 1
# Verify: nvidia-smi -L
# (List the profile names this GPU/driver supports: nvidia-smi mig -lgip)

---
# ── K3s: NVIDIA Device Plugin with MIG strategy ──────────────
# Runs the NVIDIA device plugin on every node so the kubelet can
# advertise GPU / MIG resources to the scheduler.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
  labels:
    app: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      app: nvidia-device-plugin
  template:
    metadata:
      labels:
        app: nvidia-device-plugin
    spec:
      tolerations:
        # Allow scheduling onto nodes tainted as dedicated GPU nodes.
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark the plugin as node-critical so it is not evicted under
      # resource pressure; losing it would withdraw the GPU resources
      # from the node.
      priorityClassName: system-node-critical
      # NOTE(review): on K3s the plugin pod may also need
      # `runtimeClassName: nvidia` unless the NVIDIA runtime is
      # containerd's default — confirm against the cluster setup.
      containers:
        - name: nvidia-device-plugin
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          env:
            # "mixed" strategy: each MIG device is advertised as its own
            # resource type (nvidia.com/mig-<profile>); GPUs that are not
            # in MIG mode are still advertised as nvidia.com/gpu.
            - name: MIG_STRATEGY
              value: "mixed"
            # Do not crash-loop on nodes where plugin initialisation
            # fails (e.g. nodes without a GPU).
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            # Kubelet device-plugin socket directory.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins

---
# ── Node label: MIG-capable workstation ──────────────────────
# Apply once: kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
# GPU pods can then be pinned to this node with a nodeSelector on that
# label; the label by itself does not restrict scheduling.
#
# This ConfigMap only carries the operator-facing setup notes; nothing
# in this manifest consumes its data.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mig-setup-instructions
  namespace: velocity-os
data:
  # `-mig 1` is the nvidia-smi switch that enables MIG mode.
  # kubernetes.io/hostname is not set in step 3 because the kubelet
  # already manages that label on every node.
  README: |
    RTX 6000 Blackwell MIG Setup (run on workstation before deploying):

    1. Enable MIG mode:
       sudo nvidia-smi -i 0 -mig 1 && sudo reboot

    2. Create two 3g.48gb instances (post-reboot):
       sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C

    3. Label K3s node:
       kubectl label node velocity-workstation \
         nvidia.com/mig.strategy=mixed

    4. Verify resource availability:
       kubectl describe node velocity-workstation | grep nvidia

    Expected output:
      nvidia.com/mig-3g.48gb: 2   (2 slices available)

    Deployment assignments:
      core-api     → nvidia.com/mig-3g.48gb: 1  (SGLang, slice 0)
      media-engine → nvidia.com/mig-3g.48gb: 1  (ComfyUI, slice 1)