forked from sagnik/Velocity-OS
Initial commit: Velocity-OS migration
This commit is contained in:
107
infrastructure/k3s/deployments/gpu-mig-config.yaml
Normal file
107
infrastructure/k3s/deployments/gpu-mig-config.yaml
Normal file
@@ -0,0 +1,107 @@
|
||||
# ============================================================
# Velocity-OS — NVIDIA MIG Configuration for K3s
# Target GPU: NVIDIA RTX 6000 Blackwell (96GB VRAM)
#
# MIG Strategy: Partition 96GB into two equal 48GB slices:
#   MIG slice 0 (3g.48gb): SGLang LLM inference (core-api)
#   MIG slice 1 (3g.48gb): ComfyUI media generation (media-engine)
#
# Result: Concurrent zero-contention GPU execution.
#         No operator toggle required.
#
# Prerequisites on workstation:
#   - nvidia-driver >= 570 (Blackwell GPUs are not supported by the
#     550 branch)
#   - CUDA >= 12.8 (first toolkit release with Blackwell support)
#   - k3s with nvidia-container-toolkit
#   - NVIDIA device plugin with MIG support
# ============================================================
|
||||
|
||||
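# ── Step 1: Enable MIG mode on the GPU ───────────────────────
# Run on the workstation. Intended as a one-time step; persistence across
# reboots relies on a boot-time service re-applying the setting.
# NOTE(review): whether MIG mode is retained across a reboot depends on the
# GPU generation — confirm against the NVIDIA MIG User Guide for this card.
#   sudo nvidia-smi -i 0 --mig-mode=ENABLE
#   sudo reboot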
|
||||
|
||||
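# ── Step 2: Create MIG instances ─────────────────────────────
# Run after reboot. First list the profiles this GPU actually offers and
# confirm that 3g.48gb is among them:
#   sudo nvidia-smi mig -lgip
# Then create the GPU instances and their default compute instances (-C):
#   sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
# This creates:
#   GPU instance 0: 3g.48gb (48GB) → MIG device 0
#   GPU instance 1: 3g.48gb (48GB) → MIG device 1
# MIG instances are not persisted across reboots; re-run the create command
# (or automate it with a boot-time service) after every restart.
# Verify: nvidia-smi -L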
|
||||
|
||||
---
|
||||
# ── K3s: NVIDIA Device Plugin with MIG strategy ──────────────
# Runs the NVIDIA device plugin as a DaemonSet in kube-system so the kubelet
# can advertise GPU / MIG resources to the scheduler.
#
# NOTE(review): on K3s the NVIDIA runtime is usually not containerd's default
# runtime. If the plugin starts but the node advertises no nvidia.com/*
# resources, the pod probably needs `runtimeClassName: nvidia` (and a matching
# RuntimeClass) — confirm against the K3s NVIDIA runtime documentation.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
  labels:
    app: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      app: nvidia-device-plugin
  # Explicit for clarity; RollingUpdate is also the DaemonSet default.
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: nvidia-device-plugin
    spec:
      # Allow scheduling onto nodes tainted as dedicated GPU nodes.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Treat the plugin as a critical node add-on so it is not evicted or
      # preempted under node pressure; if it goes away, GPU resources vanish
      # from the node and GPU workloads can no longer be scheduled.
      priorityClassName: system-node-critical
      containers:
        - name: nvidia-device-plugin
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          env:
            # "mixed" strategy: each MIG profile is advertised as its own
            # resource (nvidia.com/mig-<profile>, e.g. nvidia.com/mig-3g.48gb),
            # while GPUs not in MIG mode remain available as nvidia.com/gpu.
            - name: MIG_STRATEGY
              value: "mixed"
            # "false": do not exit when initialization fails (e.g. no GPU or
            # driver visible); the plugin idles instead of crash-looping.
            # Be aware that this also hides a misconfigured runtime.
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: [ALL]
          volumeMounts:
            # The plugin registers with the kubelet through a socket in this
            # directory.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
|
||||
|
||||
---
|
||||
# ── Node label: MIG-capable workstation ──────────────────────
# Apply once: kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
# The label only marks the node; GPU pods are restricted to it only if their
# pod specs select on it (nodeSelector / node affinity).
#
# This ConfigMap carries the operator-facing setup instructions as plain text;
# nothing in the cluster executes its contents.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mig-setup-instructions
  namespace: velocity-os
data:
  README: |
    RTX 6000 Blackwell MIG Setup (run on workstation before deploying):

    1. Enable MIG mode:
       sudo nvidia-smi -i 0 --mig-mode=ENABLE && sudo reboot

    2. Create two 3g.48gb instances (post-reboot; repeat after every
       reboot, because MIG instances are not persisted):
       sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C

    3. Label K3s node (kubernetes.io/hostname is set by the kubelet and
       must not be assigned by hand):
       kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed

    4. Verify resource availability:
       kubectl describe node velocity-workstation | grep nvidia

    Expected output:
      nvidia.com/mig-3g.48gb: 2  (2 slices available)

    Deployment assignments:
      core-api     → nvidia.com/mig-3g.48gb: 1  (SGLang, slice 0)
      media-engine → nvidia.com/mig-3g.48gb: 1  (ComfyUI, slice 1)
|
||||
Reference in New Issue
Block a user