Initial commit: Velocity-OS migration

This commit is contained in:
2026-05-01 12:32:19 +05:30
commit 407af828d4
283 changed files with 207782 additions and 0 deletions

View File

@@ -0,0 +1,342 @@
# ============================================================
# Velocity-OS — K3s Deployments
# All services in velocity-os namespace.
# GPU: RTX 6000 Blackwell 96GB VRAM — MIG partitioned.
# MIG slice 0 (48GB): SGLang LLM inference (core-api)
# MIG slice 1 (48GB): ComfyUI media generation (media-engine)
# ============================================================
---
# ── PostgreSQL (StatefulSet for stable identity) ─────────────
# PostgreSQL as a single-replica StatefulSet; `serviceName` ties the pod to
# the `postgres` Service so it keeps a stable DNS identity.
# NOTE(review): ${ECR_REGISTRY} must be substituted before `kubectl apply`
# (presumably by envsubst in the deploy script — confirm).
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
  namespace: velocity-os
  labels:
    app: postgres
    tier: database
spec:
  serviceName: postgres
  replicas: 1
  selector:
    matchLabels:
      app: postgres
  template:
    metadata:
      labels:
        app: postgres
        tier: database
    spec:
      containers:
        - name: postgres
          image: ${ECR_REGISTRY}/postgres:15-alpine
          ports:
            - containerPort: 5432
          env:
            # Database name and credentials all come from velocity-secrets.
            - name: POSTGRES_DB
              valueFrom:
                secretKeyRef:
                  name: velocity-secrets
                  key: POSTGRES_DB
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: velocity-secrets
                  key: POSTGRES_USER
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: velocity-secrets
                  key: POSTGRES_PASSWORD
            # Data directory is a subdirectory of the volume mount below.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          volumeMounts:
            - name: postgres-data
              mountPath: /var/lib/postgresql/data
          # Probes run through a shell so they use the role and database the
          # container was actually started with. Probe commands do not get
          # $(VAR) expansion, and a hard-coded user makes the server log a
          # failed connection attempt whenever the Secret names another role.
          livenessProbe:
            exec:
              command:
                - sh
                - -c
                - 'pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"'
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - sh
                - -c
                - 'pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB"'
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        # Pre-created claim rather than a volumeClaimTemplate.
        - name: postgres-data
          persistentVolumeClaim:
            claimName: pvc-postgres-data
---
# ── Redis (session cache, future queue) ──────────────────────
# Single-replica Redis with no volume: contents are lost on pod restart.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: velocity-os
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: ${ECR_REGISTRY}/redis:7-alpine
          # Cap the dataset at 400mb (below the 512Mi container limit) and
          # evict least-recently-used keys once the cap is reached.
          args:
            - --maxmemory
            - 400mb
            - --maxmemory-policy
            - allkeys-lru
          ports:
            - containerPort: 6379
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "250m"
          # Added: without probes the Service sends traffic as soon as the
          # container starts and a hung server is never restarted.
          readinessProbe:
            tcpSocket:
              port: 6379
            initialDelaySeconds: 2
            periodSeconds: 5
          livenessProbe:
            tcpSocket:
              port: 6379
            initialDelaySeconds: 10
            periodSeconds: 10
---
# ── Core API (FastAPI) ────────────────────────────────────────
# Core API (FastAPI) with SGLang inference on one MIG slice.
#
# Each pod claims one nvidia.com/mig-3g.48gb device and the GPU is split into
# only two such slices, one of which is reserved for media-engine. The
# Deployment therefore runs a single replica and never surges: two replicas,
# or a surge pod during rollout, would ask for a slice that does not exist
# and stay Pending.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: core-api
  namespace: velocity-os
  labels:
    app: core-api
    tier: backend
spec:
  replicas: 1
  selector:
    matchLabels:
      app: core-api
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # Release the MIG slice first, then start the replacement pod.
      maxUnavailable: 1
      maxSurge: 0
  template:
    metadata:
      labels:
        app: core-api
        tier: backend
    spec:
      # NVIDIA container runtime so the MIG device is visible in the pod.
      runtimeClassName: nvidia
      containers:
        - name: core-api
          image: ${ECR_REGISTRY}/velocity-os/core:latest
          ports:
            - containerPort: 8443
          envFrom:
            - secretRef:
                name: velocity-secrets
            - configMapRef:
                name: velocity-config
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
              # MIG 3g.48gb slice for SGLang; an extended resource must
              # appear with the same value in requests and limits.
              nvidia.com/mig-3g.48gb: "1"
            limits:
              memory: "2Gi"
              cpu: "1000m"
              nvidia.com/mig-3g.48gb: "1"
          volumeMounts:
            - name: asset-store
              mountPath: /opt/assets
            - name: model-cache
              mountPath: /opt/models
              readOnly: true
          livenessProbe:
            httpGet:
              path: /health
              port: 8443
            initialDelaySeconds: 20
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /health
              port: 8443
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: asset-store
          persistentVolumeClaim:
            claimName: pvc-asset-store
        - name: model-cache
          persistentVolumeClaim:
            claimName: pvc-model-cache
---
# ── WebOS (Nginx static + React) ─────────────────────────────
# WebOS front end: two replicas, replaced one at a time with no capacity loss
# (maxUnavailable 0, maxSurge 1).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: webos
  namespace: velocity-os
  labels:
    app: webos
    tier: frontend
spec:
  replicas: 2
  selector:
    matchLabels:
      app: webos
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 0
      maxSurge: 1
  template:
    metadata:
      labels:
        app: webos
        tier: frontend
    spec:
      containers:
        - name: webos
          image: ${ECR_REGISTRY}/velocity-os/webos:latest
          ports:
            - containerPort: 80
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "250m"
          # Added: with maxUnavailable 0 the rollout removes an old pod as
          # soon as the new one is Ready, so readiness must reflect that the
          # server is actually answering.
          readinessProbe:
            httpGet:
              path: /health.txt
              port: 80
            initialDelaySeconds: 2
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /health.txt
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 10
---
# ── Media Engine (Dream Weaver Gateway) ──────────────────────
# Media Engine (Dream Weaver gateway) with ComfyUI on one MIG slice.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: media-engine
  namespace: velocity-os
  labels:
    app: media-engine
    tier: ai
spec:
  replicas: 1
  selector:
    matchLabels:
      app: media-engine
  # Recreate instead of the default RollingUpdate: a surge pod would need a
  # second MIG slice while the old pod still holds this one, and would stay
  # Pending whenever the other slice is in use.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: media-engine
        tier: ai
    spec:
      # NVIDIA container runtime so the MIG device is visible in the pod.
      runtimeClassName: nvidia
      containers:
        - name: media-engine
          image: ${ECR_REGISTRY}/velocity-os/media-engine:latest
          ports:
            - containerPort: 8290
          envFrom:
            - secretRef:
                name: velocity-secrets
            - configMapRef:
                name: velocity-config
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
              # MIG 3g.48gb slice for ComfyUI; an extended resource must
              # appear with the same value in requests and limits.
              nvidia.com/mig-3g.48gb: "1"
            limits:
              memory: "4Gi"
              cpu: "2000m"
              nvidia.com/mig-3g.48gb: "1"
          volumeMounts:
            - name: model-cache
              mountPath: /opt/models
              readOnly: true
            - name: asset-store
              mountPath: /opt/assets
          livenessProbe:
            httpGet:
              path: /health
              port: 8290
            initialDelaySeconds: 30
            periodSeconds: 30
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: pvc-model-cache
        - name: asset-store
          persistentVolumeClaim:
            claimName: pvc-asset-store
---
# ── DB Init Job (runs once: schema apply + seed) ─────────────
# One-shot Job: waits for PostgreSQL, applies the SQL schema files in order,
# then seeds the synthetic CRM data.
apiVersion: batch/v1
kind: Job
metadata:
  name: db-init
  namespace: velocity-os
  labels:
    app: db-init
spec:
  # Never auto-retry; the operator re-runs manually if needed.
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: db-init
    spec:
      restartPolicy: Never
      initContainers:
        # Block until postgres accepts connections. The role name is read by
        # the shell from the environment supplied by velocity-secrets.
        - name: wait-for-postgres
          image: ${ECR_REGISTRY}/postgres:15-alpine
          command:
            - sh
            - -c
            - >-
              until pg_isready -h postgres -U "$POSTGRES_USER";
              do echo waiting...; sleep 2; done
          envFrom:
            - secretRef:
                name: velocity-secrets
      containers:
        - name: db-init
          image: ${ECR_REGISTRY}/velocity-os/core:latest
          command:
            - sh
            - -c
            # `set -eu` aborts on the first failing command or unset
            # variable. ON_ERROR_STOP makes psql exit non-zero on an SQL
            # error; by default it keeps going and exits 0. Together they
            # make a broken schema fail the Job instead of reporting success.
            - |
              set -eu
              echo "=== Applying schemas ==="
              for schema in \
                /app/db/schema.sql \
                /app/db/schema_addendum.sql \
                /app/db/schema_comms.sql \
                /app/db/schema_crm_canonical.sql \
                /app/oracle/schema_oracle.sql \
                /app/oracle/schema_extension_v2.sql
              do
                psql "$DATABASE_URL" -v ON_ERROR_STOP=1 -f "$schema"
              done
              echo "=== Seeding synthetic CRM v2 ==="
              python /app/scripts/seed_synthetic_crm.py
              echo "=== DB init complete ==="
          envFrom:
            - secretRef:
                name: velocity-secrets
            - configMapRef:
                name: velocity-config

View File

@@ -0,0 +1,107 @@
# ============================================================
# Velocity-OS — NVIDIA MIG Configuration for K3s
# Target GPU: NVIDIA RTX 6000 Blackwell (96GB VRAM)
#
# MIG Strategy: Partition 96GB into two equal 48GB slices:
# MIG slice 0 (3g.48gb): SGLang LLM inference (core-api)
# MIG slice 1 (3g.48gb): ComfyUI media generation (media-engine)
#
# Result: Concurrent zero-contention GPU execution.
# No operator toggle required.
#
# Prerequisites on workstation:
# - nvidia-driver >= 550
# - CUDA >= 12.4
# - k3s with nvidia-container-toolkit
# - NVIDIA device plugin with MIG support
# ============================================================
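# ── Step 1: Enable MIG mode on the GPU ───────────────────────
# Run on workstation (one-time; the MIG mode setting persists across reboots):
# sudo nvidia-smi -i 0 --mig-mode=ENABLE
# sudo reboot
# ── Step 2: Create MIG instances ─────────────────────────────
# Run after every reboot. GPU instances are NOT persistent, so they must be
# recreated each boot (e.g. from a systemd unit) before K3s schedules GPU pods:
# sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
# NOTE(review): profile names differ per GPU model; confirm that "3g.48gb"
# is offered on this card with: sudo nvidia-smi mig -lgip
# This creates:
# GPU instance 0: 3g.48gb (48GB) → MIG device 0
# GPU instance 1: 3g.48gb (48GB) → MIG device 1
# Verify: nvidia-smi -L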
---
# ── K3s: NVIDIA Device Plugin with MIG strategy ──────────────
# NVIDIA device plugin: advertises the node's GPU devices to the kubelet
# through the device-plugin socket directory mounted below.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
  labels:
    app: nvidia-device-plugin
spec:
  selector:
    matchLabels:
      app: nvidia-device-plugin
  template:
    metadata:
      labels:
        app: nvidia-device-plugin
    spec:
      # K3s registers the NVIDIA runtime but does not make it the default, so
      # the plugin must opt in to see the GPU. Without it the plugin finds no
      # devices and, with FAIL_ON_INIT_ERROR=false, idles without advertising
      # any nvidia.com/* resources.
      runtimeClassName: nvidia
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: nvidia-device-plugin
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
          env:
            # "mixed": each MIG profile is advertised as its own resource
            # (nvidia.com/mig-<profile>), which is the name the GPU workloads
            # request; non-MIG GPUs remain nvidia.com/gpu.
            - name: MIG_STRATEGY
              value: "mixed"
            # Keep the pod running instead of crash-looping when no GPU is
            # found at start-up.
            - name: FAIL_ON_INIT_ERROR
              value: "false"
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
---
# ── Node label: MIG-capable workstation ──────────────────────
# Apply once: kubectl label node velocity-workstation nvidia.com/mig.strategy=mixed
# The label is informational: none of the manifests shown here select on it.
# GPU pods are placed by their nvidia.com/mig-3g.48gb resource request, and
# pods using the local PersistentVolumes are pinned by the volumes' node
# affinity on kubernetes.io/hostname.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mig-setup-instructions
  namespace: velocity-os
data:
  # Operator runbook stored in-cluster; nothing shown here reads this key.
  README: |
    RTX 6000 Blackwell MIG Setup (run on workstation before deploying):
    1. Enable MIG mode:
         sudo nvidia-smi -i 0 --mig-mode=ENABLE && sudo reboot
    2. Create two 3g.48gb instances (post-reboot, and again after every
       later reboot - GPU instances do not persist):
         sudo nvidia-smi mig -lgip    # confirm the profile name first
         sudo nvidia-smi mig -cgi "3g.48gb,3g.48gb" -C
    3. Label K3s node:
         kubectl label node velocity-workstation \
           nvidia.com/mig.strategy=mixed
       Do not set kubernetes.io/hostname by hand; the kubelet manages it.
       The node must already be registered as velocity-workstation
       (the local PersistentVolumes are pinned to that hostname):
         kubectl get node velocity-workstation --show-labels
    4. Verify resource availability:
         kubectl describe node velocity-workstation | grep nvidia
       Expected output:
         nvidia.com/mig-3g.48gb: 2 (2 slices available)
    Deployment assignments:
      core-api → nvidia.com/mig-3g.48gb: 1 (SGLang, slice 0)
      media-engine → nvidia.com/mig-3g.48gb: 1 (ComfyUI, slice 1)

View File

@@ -0,0 +1,64 @@
# Global options: contact e-mail and a JSON log written to admin.log.
{
	email admin@desineuron.in
	log {
		output file /var/log/caddy/admin.log
		format json
	}
}

# Snippet: JSON access log shared by every site below.
(velocity_access_log) {
	log {
		output file /var/log/caddy/access.log
		format json
	}
}

# Snippet: proxy to the local HTTPS listener on :8443, passing the original
# host, scheme and client address upstream. Upstream certificate verification
# is disabled (tls_insecure_skip_verify).
(velocity_upstream) {
	reverse_proxy https://127.0.0.1:8443 {
		header_up Host {host}
		header_up X-Forwarded-Host {host}
		header_up X-Forwarded-Proto {scheme}
		header_up X-Forwarded-For {remote_host}
		transport http {
			tls_insecure_skip_verify
		}
	}
}

# Office / collaboration hosts, served with the certificate files on disk.
office.desineuron.in, git.desineuron.in, cloud.desineuron.in, projects.desineuron.in, talk.desineuron.in, vpn.desineuron.in {
	tls /etc/caddy/tls/fullchain.pem /etc/caddy/tls/privkey.pem
	import velocity_access_log
	import velocity_upstream
}

# Velocity-OS front door; the managed include is pulled in ahead of the
# generic upstream, as in the original ordering.
velocity.desineuron.in {
	import velocity_access_log
	import /etc/caddy/managed/llm_upstream.caddy_inc
	import velocity_upstream
}

# Ops console.
ops.desineuron.in {
	import velocity_access_log
	import velocity_upstream
}

# Any additional managed site definitions.
import /etc/caddy/managed/*.caddy

View File

@@ -0,0 +1,158 @@
# ============================================================
# Velocity-OS — K3s Traefik Ingress
# Domain: velocity.local | TLS: self-signed via cert-manager
# ============================================================
# ── cert-manager ClusterIssuer (self-signed for velocity.local) ──
---
# Bootstrap issuer: certificates it issues are signed with their own key.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: velocity-selfsigned-issuer
spec:
  selfSigned: {}
---
# Self-signed CA Certificate
# Root CA certificate, self-signed via velocity-selfsigned-issuer.
#
# Its Secret is consumed by the ClusterIssuer velocity-ca-issuer. A
# ClusterIssuer looks up referenced Secrets only in cert-manager's cluster
# resource namespace, so the Certificate must live there for the Secret to
# be created where the issuer can read it.
# NOTE(review): `cert-manager` is the default cluster resource namespace;
# adjust if cert-manager runs with a different --cluster-resource-namespace.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: velocity-local-ca
  namespace: cert-manager
spec:
  isCA: true
  commonName: velocity-local-ca
  secretName: velocity-local-ca-secret
  privateKey:
    algorithm: ECDSA
    size: 256
  issuerRef:
    name: velocity-selfsigned-issuer
    kind: ClusterIssuer
    group: cert-manager.io
---
# CA-backed ClusterIssuer for velocity.local
# Cluster-wide issuer that signs with the CA key pair stored in
# velocity-local-ca-secret.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: velocity-ca-issuer
spec:
  ca:
    secretName: velocity-local-ca-secret
---
# TLS Certificate for velocity.local
# Serving certificate for velocity.local and its subdomains, issued by
# velocity-ca-issuer and stored in velocity-local-tls-secret.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: velocity-local-tls
  namespace: velocity-os
spec:
  secretName: velocity-local-tls-secret
  issuerRef:
    group: cert-manager.io
    kind: ClusterIssuer
    name: velocity-ca-issuer
  commonName: velocity.local
  subject:
    organizations:
      - Desineuron
  dnsNames:
    - velocity.local
    - "*.velocity.local"
  # Valid for one year; renewal starts 30 days before expiry.
  duration: 8760h
  renewBefore: 720h
---
# ── Main Ingress ─────────────────────────────────────────────
# Main Ingress for velocity.local, served by Traefik on the websecure
# entrypoint with the certificate in velocity-local-tls-secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: velocity-os-ingress
  namespace: velocity-os
  annotations:
    # Traefik (K3s built-in)
    kubernetes.io/ingress.class: traefik
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    # Middleware reference format: <namespace>-<name>@kubernetescrd.
    # The ws-headers middleware must NOT be attached here: it forces
    # `Connection: Upgrade` / `Upgrade: websocket` onto every request of
    # every route below, so ordinary API and page requests reach the backends
    # looking like WebSocket handshakes. Traefik forwards real WebSocket
    # upgrades without any middleware. The security-headers middleware,
    # previously defined but attached nowhere, is applied instead.
    traefik.ingress.kubernetes.io/router.middlewares: velocity-os-security-headers@kubernetescrd
spec:
  tls:
    - hosts:
        - velocity.local
      secretName: velocity-local-tls-secret
  rules:
    - host: velocity.local
      http:
        # Order of the entries is not significant: for Prefix paths the
        # longest matching path wins.
        paths:
          # API (FastAPI backend)
          - path: /api
            pathType: Prefix
            backend:
              service:
                name: core-api
                port:
                  number: 8443
          # WebSockets (Sentinel, Oracle canvas, Catalyst)
          - path: /ws
            pathType: Prefix
            backend:
              service:
                name: core-api
                port:
                  number: 8443
          # Dream Weaver gateway
          - path: /dream-weaver
            pathType: Prefix
            backend:
              service:
                name: media-engine
                port:
                  number: 8290
          # Vault public links (no auth)
          - path: /vault
            pathType: Prefix
            backend:
              service:
                name: core-api
                port:
                  number: 8443
          # WebOS (React SPA — catch-all)
          - path: /
            pathType: Prefix
            backend:
              service:
                name: webos
                port:
                  number: 80
---
# ── Traefik Middleware: WebSocket upgrade headers ─────────────
# Traefik middleware that sets `Connection: Upgrade` and `Upgrade: websocket`
# on every request passing through it. Attach it only to routers that carry
# WebSocket traffic exclusively; on a shared router it rewrites normal HTTP
# requests as well.
#
# API group: traefik.io replaces traefik.containo.us, which is deprecated
# since Traefik v2.10 and removed in Traefik v3.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: ws-headers
  namespace: velocity-os
spec:
  headers:
    customRequestHeaders:
      Connection: "Upgrade"
      Upgrade: "websocket"
---
# ── Traefik Middleware: Security headers ─────────────────────
# Traefik middleware adding browser security response headers.
#
# API group: traefik.io replaces traefik.containo.us, which is deprecated
# since Traefik v2.10 and removed in Traefik v3.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  name: security-headers
  namespace: velocity-os
spec:
  headers:
    # HSTS for one year (31536000 s), subdomains included; forceSTSHeader
    # sends it even when the connection is not seen as HTTPS.
    stsSeconds: 31536000
    stsIncludeSubdomains: true
    forceSTSHeader: true
    # X-Content-Type-Options: nosniff
    contentTypeNosniff: true
    # X-XSS-Protection header
    browserXssFilter: true
    referrerPolicy: strict-origin-when-cross-origin
    # X-Frame-Options: DENY
    frameDeny: true

View File

@@ -0,0 +1,27 @@
# ============================================================
# Velocity-OS — K3s Namespaces
# ============================================================
---
# Namespace for the application workloads, Services and Ingress.
apiVersion: v1
kind: Namespace
metadata:
  name: velocity-os
  labels:
    environment: "production"
    app.kubernetes.io/managed-by: "velocity-os"
---
# Namespace velocity-agents.
# NOTE(review): no resource in the manifests shown here targets it yet.
apiVersion: v1
kind: Namespace
metadata:
  name: velocity-agents
  labels:
    environment: "production"
    app.kubernetes.io/managed-by: "velocity-os"
---
# Namespace for shared infrastructure resources.
apiVersion: v1
kind: Namespace
metadata:
  name: velocity-infra
  labels:
    environment: "production"
    app.kubernetes.io/managed-by: "velocity-os"

View File

@@ -0,0 +1,82 @@
# ============================================================
# Velocity-OS — K3s Services
# ClusterIP for internal, none for headless StatefulSet
# ============================================================
---
# Headless Service (clusterIP: None): DNS resolves straight to the postgres
# pod, giving the StatefulSet a stable network name.
apiVersion: v1
kind: Service
metadata:
  name: postgres
  namespace: velocity-os
  labels:
    app: postgres
spec:
  clusterIP: None
  selector:
    app: postgres
  ports:
    - protocol: TCP  # the default, spelled out
      port: 5432
      targetPort: 5432
---
# In-cluster endpoint for Redis.
apiVersion: v1
kind: Service
metadata:
  name: redis
  namespace: velocity-os
  labels:
    app: redis
spec:
  type: ClusterIP
  selector:
    app: redis
  ports:
    - protocol: TCP  # the default, spelled out
      port: 6379
      targetPort: 6379
---
# In-cluster endpoint for the core API pods on port 8443.
apiVersion: v1
kind: Service
metadata:
  name: core-api
  namespace: velocity-os
  labels:
    app: core-api
spec:
  type: ClusterIP
  selector:
    app: core-api
  ports:
    - name: http
      protocol: TCP  # the default, spelled out
      port: 8443
      targetPort: 8443
---
# In-cluster endpoint for the WebOS front-end pods on port 80.
apiVersion: v1
kind: Service
metadata:
  name: webos
  namespace: velocity-os
  labels:
    app: webos
spec:
  type: ClusterIP
  selector:
    app: webos
  ports:
    - name: http
      protocol: TCP  # the default, spelled out
      port: 80
      targetPort: 80
---
# In-cluster endpoint for the media-engine pod on port 8290.
apiVersion: v1
kind: Service
metadata:
  name: media-engine
  namespace: velocity-os
  labels:
    app: media-engine
spec:
  type: ClusterIP
  selector:
    app: media-engine
  ports:
    - name: http
      protocol: TCP  # the default, spelled out
      port: 8290
      targetPort: 8290

View File

@@ -0,0 +1,132 @@
# ============================================================
# Velocity-OS — K3s StorageClasses + PersistentVolumes
# Target: RTX 6000 Blackwell workstation NVMe drive
# ============================================================
---
# StorageClass: local-nvme (no provisioner — manually bound PVs)
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: local-nvme
# No dynamic provisioning: volumes of this class are created by hand.
provisioner: kubernetes.io/no-provisioner
reclaimPolicy: Retain
# Bind a claim only once a pod that uses it is being scheduled.
volumeBindingMode: WaitForFirstConsumer
---
# PV: PostgreSQL data (50Gi on NVMe)
# Local volume for PostgreSQL, pinned to the node whose hostname label is
# velocity-workstation.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-postgres-data
  labels:
    app: postgres
spec:
  storageClassName: local-nvme
  persistentVolumeReclaimPolicy: Retain
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  local:
    path: /opt/dlami/nvme/data/postgres
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - velocity-workstation
---
# PV: AI model cache (500Gi — Wan 2.2, Qwen-Image, Qwen3.6)
# Local volume for the AI model cache, offered read-only and pinned to the
# node whose hostname label is velocity-workstation.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-model-cache
  labels:
    app: model-cache
spec:
  storageClassName: local-nvme
  persistentVolumeReclaimPolicy: Retain
  capacity:
    storage: 500Gi
  accessModes:
    - ReadOnlyMany
  local:
    path: /opt/dlami/nvme/models
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - velocity-workstation
---
# PV: Generated asset store (200Gi)
# Local volume for generated assets, shared read-write and pinned to the
# node whose hostname label is velocity-workstation.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-asset-store
  labels:
    app: asset-store
spec:
  storageClassName: local-nvme
  persistentVolumeReclaimPolicy: Retain
  capacity:
    storage: 200Gi
  accessModes:
    - ReadWriteMany
  local:
    path: /opt/dlami/nvme/assets
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: kubernetes.io/hostname
              operator: In
              values:
                - velocity-workstation
---
# PVCs
# Claim for the PostgreSQL data volume; the selector restricts binding to
# volumes labelled app=postgres.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-postgres-data
  namespace: velocity-os
spec:
  storageClassName: local-nvme
  selector:
    matchLabels:
      app: postgres
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
---
# Claim for the model cache volume; the selector restricts binding to
# volumes labelled app=model-cache.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-model-cache
  namespace: velocity-os
spec:
  storageClassName: local-nvme
  selector:
    matchLabels:
      app: model-cache
  accessModes:
    - ReadOnlyMany
  resources:
    requests:
      storage: 500Gi
---
# Claim for the generated-asset volume; the selector restricts binding to
# volumes labelled app=asset-store.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-asset-store
  namespace: velocity-os
spec:
  storageClassName: local-nvme
  selector:
    matchLabels:
      app: asset-store
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 200Gi