Initial commit: Velocity-OS migration

2026-05-01 12:32:19 +05:30
commit 407af828d4
283 changed files with 207782 additions and 0 deletions
--- a/infrastructure/k3s/deployments/gpu-mig-config.yaml
+++ b/infrastructure/k3s/deployments/gpu-mig-config.yaml
@@ -0,0 +1,107 @@
+# ============================================================
+# Velocity-OS — NVIDIA MIG Configuration for K3s
+# Target GPU: NVIDIA RTX 6000 Blackwell (96GB VRAM)
+#
+# MIG Strategy: Partition 96GB into two equal 48GB slices:
+#   MIG slice 0 (3g.48gb): SGLang LLM inference (core-api)
+#   MIG slice 1 (3g.48gb): ComfyUI media generation (media-engine)
+#
+# Result: Concurrent zero-contention GPU execution.
+# No operator toggle required.
+#
+# Prerequisites on workstation:
+#   - nvidia-driver >= 550
+#   - CUDA >= 12.4
+#   - k3s with nvidia-container-toolkit
+#   - NVIDIA device plugin with MIG support
+# ============================================================
+
+# ── Step 1: Enable MIG mode on the GPU ───────────────────────
+# Run on workstation (one-time, survives reboot via service):
+#   sudo nvidia-smi -i 0 --mig-mode=ENABLE
+#   sudo reboot
+
+# ── Step 2: Create MIG instances ─────────────────────────────
+# Run after every reboot or GPU reset (MIG instances are not persistent):
+#   sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
+# This creates:
+#   GPU instance 0: 3g.48gb (48GB) → MIG device 0
+#   GPU instance 1: 3g.48gb (48GB) → MIG device 1
+# Verify: nvidia-smi -L
+
+---
+# ── K3s: NVIDIA Device Plugin with MIG strategy ──────────────
+# DaemonSet running the NVIDIA device plugin, which registers GPU / MIG
+# resources with the kubelet through the device-plugins socket directory.
+# NOTE(review): on K3s the "nvidia" container runtime is typically not the
+# containerd default — confirm whether this pod also needs
+# `runtimeClassName: nvidia` (and a matching RuntimeClass) on this cluster.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin
+  namespace: kube-system
+  labels:
+    app: nvidia-device-plugin
+spec:
+  selector:
+    matchLabels:
+      app: nvidia-device-plugin
+  template:
+    metadata:
+      labels:
+        app: nvidia-device-plugin
+    spec:
+      # Allow scheduling onto nodes tainted with nvidia.com/gpu.
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      # Mark the plugin as a node-critical add-on (as NVIDIA's reference
+      # manifest does) so it is not evicted ahead of the GPU workloads
+      # that depend on it.
+      priorityClassName: system-node-critical
+      containers:
+        - name: nvidia-device-plugin
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          env:
+            # "mixed" strategy: every MIG profile is advertised as its own
+            # resource type (nvidia.com/mig-<profile>, e.g.
+            # nvidia.com/mig-3g.48gb); GPUs without MIG enabled are still
+            # advertised as nvidia.com/gpu.
+            - name: MIG_STRATEGY
+              value: "mixed"
+            # Do not crash-loop when initialisation fails (e.g. no GPU on
+            # the node); the plugin blocks idle instead.
+            - name: FAIL_ON_INIT_ERROR
+              value: "false"
+            # Give the plugin visibility of all MIG devices on the GPU so it
+            # can enumerate them. This requires CAP_SYS_ADMIN (see below).
+            - name: NVIDIA_MIG_MONITOR_DEVICES
+              value: "all"
+          securityContext:
+            # MIG monitoring needs SYS_ADMIN; this mirrors the security
+            # context NVIDIA's Helm chart renders for any MIG strategy other
+            # than "none". allowPrivilegeEscalation: false cannot be combined
+            # with an added SYS_ADMIN capability, so it is not set here.
+            capabilities:
+              add:
+                - SYS_ADMIN
+          volumeMounts:
+            # Kubelet device-plugin registration socket directory.
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
+
+---
+# ── Node label: MIG-capable workstation ──────────────────────
+# Apply once: kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
+# The label by itself does not restrict scheduling: GPU pods must target it
+# (nodeSelector / node affinity) to be pinned to the correct node.
+# ConfigMap that carries the operator runbook for preparing the MIG
+# partitions; it is documentation only and configures nothing by itself.
+# NOTE(review): confirm that "3g.48gb" is a profile this GPU actually offers
+# (list the available profiles with `nvidia-smi mig -lgip`) before relying
+# on the resource name nvidia.com/mig-3g.48gb.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: mig-setup-instructions
+  namespace: velocity-os
+data:
+  README: |
+    RTX 6000 Blackwell MIG Setup (run on workstation before deploying):
+
+    1. Enable MIG mode:
+       sudo nvidia-smi -i 0 --mig-mode=ENABLE && sudo reboot
+
+    2. Create two 3g.48gb instances (post-reboot):
+       sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
+       (MIG instances are not persistent; re-create them after every reboot.)
+
+    3. Label K3s node:
+       kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
+
+    4. Verify resource availability:
+       kubectl describe node velocity-workstation | grep nvidia
+
+    Expected output:
+      nvidia.com/mig-3g.48gb: 2   (2 slices available)
+
+    Deployment assignments:
+      core-api       → nvidia.com/mig-3g.48gb: 1  (SGLang, slice 0)
+      media-engine   → nvidia.com/mig-3g.48gb: 1  (ComfyUI, slice 1)