# Velocity-OS/infrastructure/k3s/deployments/gpu-mig-config.yaml

# ============================================================
# Velocity-OS — NVIDIA MIG Configuration for K3s
# Target GPU: NVIDIA RTX 6000 Blackwell (96GB VRAM)
#
# MIG Strategy: Partition 96GB into two equal 48GB slices:
#   MIG slice 0 (3g.48gb): SGLang LLM inference (core-api)
#   MIG slice 1 (3g.48gb): ComfyUI media generation (media-engine)
#
# Result: Concurrent zero-contention GPU execution.
# No operator toggle required.
#
# Prerequisites on workstation:
#   - nvidia-driver >= 550
#   - CUDA >= 12.4
#   - k3s with nvidia-container-toolkit
#   - NVIDIA device plugin with MIG support
# ============================================================

# ── Step 1: Enable MIG mode on the GPU ───────────────────────
# Run on workstation (one-time, survives reboot via service):
#   sudo nvidia-smi -i 0 --mig-mode=ENABLE
#   sudo reboot

# ── Step 2: Create MIG instances ─────────────────────────────
# Run after every reboot (MIG instances are not persistent across reboots):
#   sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
# This creates:
#   GPU instance 0: 3g.48gb (48GB) → MIG device 0
#   GPU instance 1: 3g.48gb (48GB) → MIG device 1
# Verify: nvidia-smi -L

---
# ── K3s: NVIDIA Device Plugin with MIG strategy ──────────────
# Runs the NVIDIA device plugin as a DaemonSet so the kubelet can advertise
# GPU and MIG resources to the scheduler.
# NOTE(review): on K3s the pod usually also needs `runtimeClassName: nvidia`
# unless nvidia is containerd's default runtime — confirm for this cluster.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
  labels:
    app: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      app: nvidia-device-plugin
  # Explicit for clarity: plugin pods are replaced node by node on spec changes.
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: nvidia-device-plugin
    spec:
      # Allow scheduling onto nodes carrying the nvidia.com/gpu NoSchedule taint.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # If the plugin pod is evicted, the node stops advertising its GPU/MIG
      # resources, so protect it from preemption and eviction.
      priorityClassName: system-node-critical
      containers:
        - name: nvidia-device-plugin
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          env:
            # "mixed" strategy: every MIG profile is advertised as its own
            # resource (nvidia.com/mig-<profile>); GPUs that are not in MIG
            # mode are still advertised as nvidia.com/gpu.
            - name: MIG_STRATEGY
              value: "mixed"
            # On an initialization error (e.g. no GPU on the node) the plugin
            # idles instead of exiting, so the pod does not crash-loop.
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            # Directory holding the kubelet's device-plugin registration socket.
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins

---
# ── Node label: MIG-capable workstation ──────────────────────
# Apply once: kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
# The label by itself does not constrain scheduling: GPU pods must select it
# with a nodeSelector or request an nvidia.com/mig-* resource to land on this node.
# In-cluster copy of the manual MIG setup steps for the workstation node.
# Documentation only: nothing else in this file reads the README key.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mig-setup-instructions
  namespace: velocity-os
data:
  README: |
    RTX 6000 Blackwell MIG Setup (run on workstation before deploying):

    1. Enable MIG mode (one-time):
       sudo nvidia-smi -i 0 --mig-mode=ENABLE && sudo reboot

    2. Create two 3g.48gb instances (repeat after every reboot; MIG
       instances are not persistent):
       sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C

       Verify that both MIG devices are listed:
       nvidia-smi -L

       If the profile name is rejected, list the profiles supported by
       this GPU and driver:
       nvidia-smi mig -lgip

    3. Label K3s node:
       kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed

    4. Verify resource availability:
       kubectl describe node velocity-workstation | grep nvidia

    Expected output:
      nvidia.com/mig-3g.48gb: 2   (2 slices available)

    Deployment assignments:
      core-api       → nvidia.com/mig-3g.48gb: 1  (SGLang, slice 0)
      media-engine   → nvidia.com/mig-3g.48gb: 1  (ComfyUI, slice 1)