Built the Sentinel Tab

This commit is contained in:
Sagnik
2026-04-12 02:02:58 +05:30
parent fb656d1443
commit 075ab280ad
526 changed files with 17646 additions and 70931 deletions

View File

@@ -0,0 +1,37 @@
OPS_DB_NAME=desineuron_ops
OPS_DB_USER=desineuron_ops
OPS_DB_PASSWORD=change-me
OPS_DATABASE_URL=postgresql+psycopg://desineuron_ops:change-me@ops-db:5432/desineuron_ops
OPS_SESSION_SECRET=change-me
OPS_ADMIN_USERNAME=sagnik
OPS_ADMIN_PASSWORD=change-me
OPS_TEAM_USERS_JSON=[]
OPS_DEFAULT_REGION=us-east-1
OPS_VISIBLE_REGIONS=us-east-1,ap-south-1,eu-west-1
OPS_BUCKET_NAME=
OPS_BUCKET_REGION=us-east-1
OPS_SSH_KEY_PATH=/app/state/desineuron-l4-node.pem
OPS_GPU_SSH_USER=ubuntu
OPS_INGRESS_SSH_HOST=98.87.120.120
OPS_INGRESS_SSH_USER=ec2-user
OPS_INGRESS_PRIVATE_IP=172.31.41.26
OPS_INGRESS_SSH_PORT=22
OPS_LINUX_PUBLIC_BASE_URL=https://ops.desineuron.in
OPS_PRICE_EBS_GP3_PER_GB_MONTH=0.08
OPS_PRICE_PUBLIC_IPV4_PER_HOUR=0.005
OPS_ALLOWED_MACHINE_IDS=i-094df09acafb72494,i-0e4eab5fe67cf9abe
OPS_GPU_SUBNET_ID=subnet-03d684ed15f327151
OPS_GPU_SECURITY_GROUP_IDS=sg-05e4de3fe94ad6558,sg-0b144c17b1b89f4c6
OPS_GPU_KEY_NAME=desineuron-l4-node
OPS_GPU_AMI_ID=ami-0016081b488c7376d
OPS_GPU_INSTANCE_PROFILE=Synapse-Training-Profile
OPS_GPU_ROOT_VOLUME_GB=300
OPS_GPU_WORKER_SCRIPT_PATH=/app/ops_control_plane/worker.py
OPS_CSV_EXPORT_DIR=/app/exports
OPS_LOG_DIR=/app/logs
OPS_STATE_DIR=/app/state
OPS_MODEL_LIBRARY_HOST_PATH=/mnt/ServerStorage/ai-models/models
OPS_MODEL_LIBRARY_ROOT=/model-library
OPS_INGRESS_ROUTE_HELPER=/usr/local/bin/manage_desineuron_routes.py
OPS_CLOUDFLARE_ZONE_NAME=desineuron.in
OPS_CLOUDFLARE_API_TOKEN=

View File

@@ -0,0 +1,78 @@
# Desineuron Ops Control Plane
Internal Linux-hosted control surface for:
- AWS machine lifecycle
- S3-backed model ingest with generated manifests and checksums
- model hydration from S3
- runtime and estimated cost tracking
- ingress route management
- session logging and CSV export
Main deployment target:
- Linux box at `192.168.1.4`
Primary public route:
- `ops.desineuron.in`
Canonical S3 bucket:
- `desineuron-ops-control-plane-819079556187-us-east-1`
Related AWS nodes:
- ingress: `i-094df09acafb72494`
- current GPU worker: `i-0e4eab5fe67cf9abe`
Core runtime:
- FastAPI web + API surface
- background worker
- PostgreSQL
- Docker Compose
- systemd wrapper on Linux
Key files:
- `docker-compose.yml`
- `.env.example`
- `app/ops_control_plane/main.py`
- `app/ops_control_plane/worker.py`
- `app/ops_control_plane/cli.py`
- `manage_desineuron_routes.py`
- `install_linux_ops_control_plane.sh`
Runtime paths on Linux:
- stack root: `/opt/desineuron-ops-control-plane`
- env file: `/opt/desineuron-ops-control-plane/.env`
- exports: `/opt/desineuron-ops-control-plane/exports`
- state: `/opt/desineuron-ops-control-plane/state`
Access:
- login route: `https://ops.desineuron.in/login`
- operator logins are provisioned as email-style usernames
- admin password is stored in the protected `.env` file on Linux and should be retrieved locally rather than copied into repo notes
Validated live behaviors:
- market pricing API returns live on-demand and spot views
- session and cost tracking persist in PostgreSQL and export to CSV
- spot launch failures are recorded cleanly instead of crashing the UI
- on-demand GPU launch was validated with a `g6.xlarge` lifecycle test
- managed ingress route upsert/delete was validated through the helper on the `t4g.micro` ingress
- model ingest from Linux model library to S3 was validated with `ops-smoke-model`, including manifest generation and catalog registration
Operator retrieval commands:
- admin password:
  - `sudo sed -n 's/^OPS_ADMIN_PASSWORD=//p' /opt/desineuron-ops-control-plane/.env`
- latest CSV export:
  - `ls -lah /opt/desineuron-ops-control-plane/exports`
Installer safety note:
- `install_linux_ops_control_plane.sh` intentionally excludes runtime directories (`data/`, `exports/`, `logs/`, `state/`, `.env`) from code sync so redeploys do not corrupt Postgres state or overwrite secrets

View File

@@ -0,0 +1,16 @@
# Image for the ops control plane, built on the slim Python 3.12 base.
FROM python:3.12-slim
# Do not write .pyc files and keep stdout/stderr unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# Requirements are copied on their own, before the application package.
COPY requirements.txt /app/requirements.txt
# System tools: openssh-client, curl and ca-certificates. The apt package
# lists are removed in the same layer.
RUN apt-get update \
&& apt-get install -y --no-install-recommends openssh-client curl ca-certificates \
&& rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir -r /app/requirements.txt
COPY ops_control_plane /app/ops_control_plane
# Default process: run the package's main module.
CMD ["python", "-m", "ops_control_plane.main"]

View File

@@ -0,0 +1 @@
# Public names of this package: only `main` is exported by a star-import.
__all__ = ["main"]

View File

@@ -0,0 +1,549 @@
from __future__ import annotations
import csv
import hashlib
import io
import json
import shlex
import subprocess
from pathlib import Path
from collections.abc import Iterable
from datetime import datetime, timezone
import boto3
from botocore.exceptions import ClientError
from sqlalchemy import select
from sqlalchemy.orm import Session
from .config import settings
from .models import AuditEvent, Machine, MachineModelCache, MachineProfile, MarketSnapshot, ModelCatalog, RouteBinding, Session as RuntimeSession, SessionCost
# AWS region code -> location name. fetch_on_demand_price passes the location
# name as the value of the Pricing API "location" filter; regions missing from
# this map get no on-demand price.
REGION_LOCATION_MAP = {
    "us-east-1": "US East (N. Virginia)",
    "ap-south-1": "Asia Pacific (Mumbai)",
    "eu-west-1": "EU (Ireland)",
}
# Hourly USD price keyed by (region, instance_type), returned by
# fetch_on_demand_price when the Pricing API response yields no usable price.
ON_DEMAND_PRICE_FALLBACKS = {
    ("us-east-1", "t4g.micro"): 0.0084,
}
def utcnow() -> datetime:
    """Return the current instant as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
def ec2_client(region: str):
    """Create a boto3 EC2 client bound to ``region``."""
    client = boto3.client("ec2", region_name=region)
    return client
def pricing_client():
    """Create a boto3 Pricing client.

    The client is always created against ``us-east-1``, whichever region's
    prices are being looked up.
    """
    client = boto3.client("pricing", region_name="us-east-1")
    return client
def s3_client(region: str | None = None):
    """Create a boto3 S3 client.

    Args:
        region: Region to bind the client to. Any falsy value (``None`` or an
            empty string) selects ``settings.bucket_region`` instead.
    """
    target_region = region if region else settings.bucket_region
    return boto3.client("s3", region_name=target_region)
def ensure_bucket(bucket_name: str, region: str) -> None:
    """Create ``bucket_name`` if it is missing, then apply bucket defaults.

    The bucket is probed with ``head_bucket``. A "not found" error triggers
    creation; error codes ``301`` and ``403`` are tolerated; any other
    ``ClientError`` is re-raised. Afterwards versioning is enabled and AES256
    default server-side encryption is configured.

    Args:
        bucket_name: Name of the S3 bucket.
        region: Region used for the S3 client and, outside ``us-east-1``, as
            the bucket's ``LocationConstraint``.

    Raises:
        ClientError: For unexpected probe errors or failures of the create,
            versioning or encryption calls.
    """
    s3 = s3_client(region)
    try:
        s3.head_bucket(Bucket=bucket_name)
    except ClientError as exc:
        error_code = exc.response.get("Error", {}).get("Code", "")
        bucket_missing = error_code in {"404", "NoSuchBucket", "NotFound"}
        if not bucket_missing and error_code not in {"301", "403"}:
            raise
        if bucket_missing:
            create_kwargs = {"Bucket": bucket_name}
            # us-east-1 buckets are created without a LocationConstraint.
            if region != "us-east-1":
                create_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}
            s3.create_bucket(**create_kwargs)

    s3.put_bucket_versioning(
        Bucket=bucket_name,
        VersioningConfiguration={"Status": "Enabled"},
    )
    encryption_rule = {"ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}}
    s3.put_bucket_encryption(
        Bucket=bucket_name,
        ServerSideEncryptionConfiguration={"Rules": [encryption_rule]},
    )
def seed_bucket_prefixes(bucket_name: str) -> None:
    """Write one body-less object per standard top-level prefix.

    Args:
        bucket_name: Bucket that receives the placeholder keys.
    """
    standard_prefixes = (
        "models/",
        "workflows/",
        "references/",
        "outputs/",
        "manifests/",
        "bootstrap/",
    )
    s3 = s3_client()
    for key in standard_prefixes:
        s3.put_object(Bucket=bucket_name, Key=key)
def resolve_model_source_dir(source_relative_path: str) -> Path:
    """Resolve a path inside the model library and verify it is a directory.

    Args:
        source_relative_path: Path relative to ``settings.model_library_root``.

    Returns:
        The fully resolved directory path.

    Raises:
        ValueError: If the resolved path is neither the library root nor
            located beneath it.
        FileNotFoundError: If the resolved path is missing or not a directory.
    """
    library_root = settings.model_library_root.resolve()
    candidate = (settings.model_library_root / source_relative_path).resolve()
    inside_library = candidate == library_root or library_root in candidate.parents
    if not inside_library:
        raise ValueError("Model source path escapes configured model library root")
    if not (candidate.exists() and candidate.is_dir()):
        raise FileNotFoundError(f"Model source directory not found: {candidate}")
    return candidate
def build_model_manifest(source_dir: Path) -> dict:
    """Describe every regular file under ``source_dir``.

    Files are visited in sorted path order and hashed in 1 MiB chunks.

    Args:
        source_dir: Directory that is walked recursively.

    Returns:
        A dict with ``generated_at`` (ISO timestamp), ``file_count``,
        ``total_size_bytes`` and ``files``; each file entry carries ``path``
        (POSIX style, relative to ``source_dir``), ``sha256`` and
        ``size_bytes``.
    """
    entries: list[dict] = []
    for file_path in sorted(p for p in source_dir.rglob("*") if p.is_file()):
        digest = hashlib.sha256()
        with file_path.open("rb") as stream:
            while block := stream.read(1024 * 1024):
                digest.update(block)
        entries.append(
            {
                "path": file_path.relative_to(source_dir).as_posix(),
                "sha256": digest.hexdigest(),
                "size_bytes": file_path.stat().st_size,
            }
        )
    return {
        "generated_at": utcnow().isoformat(),
        "file_count": len(entries),
        "total_size_bytes": sum(entry["size_bytes"] for entry in entries),
        "files": entries,
    }
def upload_model_directory(
    bucket_name: str,
    model_key: str,
    source_relative_path: str,
    label: str,
    workload_tags: list[str] | None = None,
    compatibility_tags: list[str] | None = None,
) -> dict:
    """Upload a model directory to S3 together with its manifest.

    Every file listed in the generated manifest is uploaded under
    ``models/<model_key>/``; the manifest itself is stored as JSON at
    ``manifests/models/<model_key>.json``.

    Args:
        bucket_name: Destination bucket.
        model_key: Identifier used in the object prefix and manifest key.
        source_relative_path: Directory relative to the model library root.
        label: Human-readable name, echoed back in the result.
        workload_tags: Optional tags; a falsy value becomes ``[]``.
        compatibility_tags: Optional tags; a falsy value becomes ``[]``.

    Returns:
        A dict describing the upload: keys, prefix, source directory,
        manifest and both tag lists.
    """
    source_dir = resolve_model_source_dir(source_relative_path)
    manifest = build_model_manifest(source_dir)
    s3 = s3_client()
    s3_prefix = f"models/{model_key}/"

    for entry in manifest["files"]:
        relative = entry["path"]
        local_file = source_dir / Path(relative)
        s3.upload_file(str(local_file), bucket_name, s3_prefix + relative)

    manifest_key = f"manifests/models/{model_key}.json"
    manifest_body = json.dumps(manifest, indent=2).encode("utf-8")
    s3.put_object(
        Bucket=bucket_name,
        Key=manifest_key,
        Body=manifest_body,
        ContentType="application/json",
    )

    summary = {
        "model_key": model_key,
        "label": label,
        "source_dir": str(source_dir),
        "s3_prefix": s3_prefix,
        "manifest_key": manifest_key,
        "manifest": manifest,
        "workload_tags": workload_tags or [],
        "compatibility_tags": compatibility_tags or [],
    }
    return summary
def fetch_on_demand_price(region: str, instance_type: str) -> float | None:
    """Look up the hourly on-demand USD price for an instance type.

    Queries the Pricing API for a shared-tenancy Linux offering without
    pre-installed software and returns the first positive USD price found.

    Args:
        region: AWS region code; must be a key of ``REGION_LOCATION_MAP``.
        instance_type: EC2 instance type, e.g. ``"t4g.micro"``.

    Returns:
        The hourly price, the ``ON_DEMAND_PRICE_FALLBACKS`` entry when the
        response holds no usable price, or ``None`` when the region is not
        mapped or no fallback exists.
    """
    location = REGION_LOCATION_MAP.get(region)
    if not location:
        return None

    filters = [
        {"Type": "TERM_MATCH", "Field": "instanceType", "Value": instance_type},
        {"Type": "TERM_MATCH", "Field": "location", "Value": location},
        {"Type": "TERM_MATCH", "Field": "operatingSystem", "Value": "Linux"},
        {"Type": "TERM_MATCH", "Field": "tenancy", "Value": "Shared"},
        {"Type": "TERM_MATCH", "Field": "preInstalledSw", "Value": "NA"},
        {"Type": "TERM_MATCH", "Field": "capacitystatus", "Value": "Used"},
    ]
    response = pricing_client().get_products(
        ServiceCode="AmazonEC2",
        Filters=filters,
        MaxResults=1,
    )

    for price_item in response.get("PriceList", []):
        # Each PriceList entry is a JSON document encoded as a string.
        item = json.loads(price_item)
        terms = item.get("terms", {}).get("OnDemand", {})
        for term in terms.values():
            for dimension in term.get("priceDimensions", {}).values():
                raw_price = dimension.get("pricePerUnit", {}).get("USD")
                if not raw_price:
                    continue
                # The price arrives as a string, so "0.0000000000" is truthy.
                # Compare numerically so a zero price falls through to the
                # fallback table instead of being reported as free.
                price = float(raw_price)
                if price > 0:
                    return price
    return ON_DEMAND_PRICE_FALLBACKS.get((region, instance_type))
def refresh_market_snapshots(db: Session, regions: Iterable[str], profile_rows: Iterable[MachineProfile]) -> None:
    """Record fresh on-demand and spot market snapshots for known profiles.

    For every region that has at least one profile, one ``on-demand`` and one
    ``spot`` ``MarketSnapshot`` is added to ``db`` per distinct instance type.
    Nothing is flushed or committed here.

    Args:
        db: Session that receives the new snapshot rows.
        regions: Regions to refresh.
        profile_rows: Machine profiles supplying (region, instance type)
            pairs. Consumed exactly once, so a one-shot iterator is safe.
    """
    # Group in a single pass: re-scanning profile_rows per region would see
    # nothing after the first region if the caller passed a generator.
    types_by_region: dict[str, set[str]] = {}
    for profile in profile_rows:
        types_by_region.setdefault(profile.region, set()).add(profile.instance_type)

    for region in regions:
        instance_types = types_by_region.get(region)
        if not instance_types:
            continue
        ec2 = ec2_client(region)
        offerings = ec2.describe_instance_type_offerings(
            LocationType="region",
            Filters=[{"Name": "instance-type", "Values": sorted(instance_types)}],
        )["InstanceTypeOfferings"]
        available = {item["InstanceType"] for item in offerings}

        # Sorted so snapshot rows are written in a deterministic order.
        for instance_type in sorted(instance_types):
            offered = instance_type in available
            db.add(
                MarketSnapshot(
                    region=region,
                    instance_type=instance_type,
                    lifecycle="on-demand",
                    offering_available=offered,
                    hourly_price_usd=fetch_on_demand_price(region, instance_type),
                    raw_payload={"instance_type": instance_type, "region": region},
                )
            )

            # A failed spot lookup is recorded as "no price", not as an error.
            try:
                spot_history = ec2.describe_spot_price_history(
                    InstanceTypes=[instance_type],
                    ProductDescriptions=["Linux/UNIX"],
                    StartTime=utcnow(),
                    MaxResults=1,
                )["SpotPriceHistory"]
                spot_price = float(spot_history[0]["SpotPrice"]) if spot_history else None
            except ClientError:
                spot_price = None
            db.add(
                MarketSnapshot(
                    region=region,
                    instance_type=instance_type,
                    lifecycle="spot",
                    offering_available=offered and spot_price is not None,
                    hourly_price_usd=spot_price,
                    raw_payload={"instance_type": instance_type, "region": region},
                )
            )
def latest_market_price(db: Session, region: str, instance_type: str, lifecycle: str) -> float:
    """Return the most recently observed hourly price for a market slot.

    Args:
        db: Session used for the lookup.
        region: AWS region code.
        instance_type: EC2 instance type.
        lifecycle: Lifecycle stored on the snapshot, e.g. ``"on-demand"`` or
            ``"spot"``.

    Returns:
        The newest snapshot's ``hourly_price_usd``, or ``0.0`` when there is
        no snapshot or its price is ``None``.
    """
    newest_snapshot = (
        select(MarketSnapshot)
        .where(
            MarketSnapshot.region == region,
            MarketSnapshot.instance_type == instance_type,
            MarketSnapshot.lifecycle == lifecycle,
        )
        .order_by(MarketSnapshot.observed_at.desc())
        # Only the first row is ever read, so do not ask for the whole history.
        .limit(1)
    )
    snapshot = db.scalar(newest_snapshot)
    if snapshot is None or snapshot.hourly_price_usd is None:
        return 0.0
    return snapshot.hourly_price_usd
def sync_instances(db: Session, regions: Iterable[str]) -> None:
    """Mirror the EC2 instances of each region into ``Machine`` rows.

    Known instances (matched on ``aws_instance_id``) are updated in place;
    unknown ones are added. Results are read through a paginator so regions
    with more instances than fit in one response page are synced completely.
    Nothing is flushed or committed here.

    Args:
        db: Session that receives the updates and new rows.
        regions: AWS region codes to scan.
    """
    for region in regions:
        ec2 = ec2_client(region)
        paginator = ec2.get_paginator("describe_instances")
        for page in paginator.paginate():
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    _sync_one_instance(db, ec2, region, instance)


def _attached_volume_gb(ec2, instance: dict) -> int:
    """Sum the sizes of the EBS volumes attached to ``instance`` (0 on API error)."""
    mappings = instance.get("BlockDeviceMappings")
    if not mappings:
        return 0
    try:
        volume_ids = [m["Ebs"]["VolumeId"] for m in mappings if "Ebs" in m]
        if not volume_ids:
            return 0
        volumes = ec2.describe_volumes(VolumeIds=volume_ids)["Volumes"]
        return sum(volume.get("Size", 0) for volume in volumes)
    except ClientError:
        return 0


def _sync_one_instance(db: Session, ec2, region: str, instance: dict) -> None:
    """Create or update the ``Machine`` row for one describe_instances entry."""
    instance_id = instance["InstanceId"]
    launch_time = instance.get("LaunchTime")
    if launch_time and launch_time.tzinfo is None:
        # Naive timestamps are taken to be UTC.
        launch_time = launch_time.replace(tzinfo=timezone.utc)
    public_ip = instance.get("PublicIpAddress")
    volume_gb = _attached_volume_gb(ec2, instance)
    existing = db.scalar(select(Machine).where(Machine.aws_instance_id == instance_id))
    tags = {tag["Key"]: tag["Value"] for tag in instance.get("Tags", [])}

    observed = {
        "name": tags.get("Name", instance_id),
        "region": region,
        "instance_type": instance["InstanceType"],
        # EC2 only reports InstanceLifecycle for non-default lifecycles.
        "lifecycle": instance.get("InstanceLifecycle", "on-demand"),
        "state": instance["State"]["Name"],
        "public_ip": public_ip,
        "private_ip": instance.get("PrivateIpAddress"),
        "launch_time": launch_time,
        "volume_gb": volume_gb,
        "public_ipv4_attached": bool(public_ip),
        "details": {
            "key_name": instance.get("KeyName"),
            "subnet_id": instance.get("SubnetId"),
            "security_groups": instance.get("SecurityGroups", []),
            "image_id": instance.get("ImageId"),
            "iam_instance_profile": instance.get("IamInstanceProfile", {}).get("Arn"),
            "availability_zone": instance.get("Placement", {}).get("AvailabilityZone"),
            "public_dns": instance.get("PublicDnsName"),
        },
    }

    if existing:
        # profile_name is deliberately left untouched on existing rows.
        for attribute, value in observed.items():
            setattr(existing, attribute, value)
    else:
        db.add(
            Machine(
                aws_instance_id=instance_id,
                profile_name=tags.get("DesineuronProfile"),
                **observed,
            )
        )
def candidate_subnet_ids(region: str, preferred_subnet_id: str) -> list[str]:
    """List launch subnets in the preferred subnet's VPC, best first.

    Ordering: the preferred subnet, then subnets that map a public IP on
    launch, then all others; ties are broken by availability zone and then
    subnet id. Only subnets in state ``available`` are considered.

    Args:
        region: Region of the subnet.
        preferred_subnet_id: Subnet to rank first. A falsy value yields ``[]``.

    Returns:
        Ordered subnet ids, or ``[preferred_subnet_id]`` when the lookup of
        the preferred subnet returns no entries.
    """
    if not preferred_subnet_id:
        return []

    ec2 = ec2_client(region)
    matches = ec2.describe_subnets(SubnetIds=[preferred_subnet_id])["Subnets"]
    if not matches:
        return [preferred_subnet_id]

    vpc_subnets = ec2.describe_subnets(
        Filters=[
            {"Name": "vpc-id", "Values": [matches[0]["VpcId"]]},
            {"Name": "state", "Values": ["available"]},
        ]
    )["Subnets"]

    def sort_key(subnet: dict) -> tuple[int, str, str]:
        current_id = subnet["SubnetId"]
        if current_id == preferred_subnet_id:
            priority = 0
        elif subnet.get("MapPublicIpOnLaunch"):
            priority = 1
        else:
            priority = 2
        return (priority, subnet.get("AvailabilityZone", ""), current_id)

    ordered_keys = sorted(sort_key(subnet) for subnet in vpc_subnets)
    return [key[2] for key in ordered_keys]
def calculate_machine_cost(machine: Machine, hourly_rate: float) -> dict:
    """Estimate what a machine has cost since its launch time.

    Storage is priced from ``settings.ebs_gp3_per_gb_month`` spread over 730
    hours per month; the public IPv4 charge applies only when
    ``machine.public_ipv4_attached`` is truthy.

    Args:
        machine: Machine supplying ``launch_time``, ``volume_gb`` and
            ``public_ipv4_attached``.
        hourly_rate: Compute price in USD per hour.

    Returns:
        A dict with ``runtime_hours``, ``compute_cost_usd``,
        ``storage_cost_usd``, ``public_ip_cost_usd``, ``total_cost_usd`` and
        the combined ``hourly_price_usd``, all rounded.
    """
    launch_time = machine.launch_time
    if not launch_time:
        runtime_hours = 0.0
    else:
        if launch_time.tzinfo is None:
            # Subtracting a naive datetime from an aware one raises TypeError;
            # treat naive values as UTC, as sync_instances does.
            launch_time = launch_time.replace(tzinfo=timezone.utc)
        runtime_hours = max((utcnow() - launch_time).total_seconds() / 3600.0, 0.0)

    # A machine row without a recorded volume size is priced as 0 GB.
    volume_gb = machine.volume_gb or 0
    storage_hourly = (volume_gb * settings.ebs_gp3_per_gb_month) / 730.0
    public_ip_hourly = settings.public_ipv4_per_hour if machine.public_ipv4_attached else 0.0

    compute_cost = runtime_hours * hourly_rate
    storage_cost = runtime_hours * storage_hourly
    public_ip_cost = runtime_hours * public_ip_hourly
    return {
        "runtime_hours": round(runtime_hours, 3),
        "compute_cost_usd": round(compute_cost, 4),
        "storage_cost_usd": round(storage_cost, 4),
        "public_ip_cost_usd": round(public_ip_cost, 4),
        "total_cost_usd": round(compute_cost + storage_cost + public_ip_cost, 4),
        "hourly_price_usd": round(hourly_rate + storage_hourly + public_ip_hourly, 4),
    }
def upsert_session_cost(db: Session, session_row: RuntimeSession, machine: Machine) -> None:
    """Create or refresh the cost record of a runtime session.

    The hourly rate comes from the latest market snapshot for the machine's
    region, instance type and lifecycle (``"on-demand"`` when the lifecycle
    is unset). Nothing is flushed or committed here.

    Args:
        db: Session used for the lookup and the write.
        session_row: Runtime session the cost belongs to.
        machine: Machine whose runtime is being priced.
    """
    hourly_rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle or "on-demand")
    cost_payload = calculate_machine_cost(machine, hourly_rate)
    record = db.scalar(
        select(SessionCost)
        .where(SessionCost.session_id == session_row.id)
        .order_by(SessionCost.calculated_at.desc())
        .limit(1)
    )
    if record is None:
        db.add(SessionCost(session_id=session_row.id, **cost_payload))
        return
    # Copy every computed field, including hourly_price_usd, so an updated
    # record carries the same columns a freshly inserted one would.
    for column, value in cost_payload.items():
        setattr(record, column, value)
    record.calculated_at = utcnow()
def create_managed_instance(db: Session, profile: MachineProfile, actor: str, lifecycle: str) -> RuntimeSession:
    """Launch one EC2 instance from ``profile`` and record it.

    Candidate subnets are tried in order. A capacity-style failure
    (``InsufficientInstanceCapacity``, ``MaxSpotInstanceCountExceeded``,
    ``Unsupported``) moves on to the next subnet; any other ``ClientError``
    aborts immediately. On success a ``Machine``, an active runtime session
    and a ``launch_machine`` audit event are added and flushed, so the
    returned session already has its primary key. Nothing is committed.

    Args:
        db: Session that receives the new rows.
        profile: Machine profile providing region, instance type and
            ``launch_config``.
        actor: Name recorded as the launcher.
        lifecycle: ``"spot"`` requests a one-time spot instance; any other
            value launches without spot market options.

    Returns:
        The newly created runtime session.

    Raises:
        ClientError: A non-retryable launch error, or the last retryable
            error once every candidate subnet has failed.
        RuntimeError: If no launch was attempted at all.
    """
    ec2 = ec2_client(profile.region)
    launch_config = profile.launch_config
    root_volume_gb = int(launch_config.get("root_volume_gb", settings.gpu_root_volume_gb))
    base_run_args = {
        "ImageId": launch_config["ami_id"],
        "InstanceType": profile.instance_type,
        "SecurityGroupIds": launch_config["security_group_ids"],
        "KeyName": launch_config["key_name"],
        "IamInstanceProfile": {"Name": launch_config["instance_profile"]},
        "MinCount": 1,
        "MaxCount": 1,
        "BlockDeviceMappings": [
            {
                "DeviceName": "/dev/sda1",
                "Ebs": {
                    "VolumeSize": root_volume_gb,
                    "VolumeType": "gp3",
                    "DeleteOnTermination": True,
                },
            }
        ],
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [
                    {"Key": "Name", "Value": f"desineuron-{profile.name}-{int(utcnow().timestamp())}"},
                    {"Key": "ManagedBy", "Value": "DesineuronOps"},
                    {"Key": "DesineuronProfile", "Value": profile.name},
                ],
            }
        ],
    }
    if lifecycle == "spot":
        base_run_args["InstanceMarketOptions"] = {
            "MarketType": "spot",
            "SpotOptions": {"SpotInstanceType": "one-time", "InstanceInterruptionBehavior": "terminate"},
        }

    preferred_subnet = launch_config["subnet_id"]
    subnet_ids = candidate_subnet_ids(profile.region, preferred_subnet) or [preferred_subnet]
    retryable_codes = {"InsufficientInstanceCapacity", "MaxSpotInstanceCountExceeded", "Unsupported"}
    last_exc: ClientError | None = None
    response = None
    chosen_subnet = preferred_subnet
    for subnet_id in subnet_ids:
        try:
            response = ec2.run_instances(**{**base_run_args, "SubnetId": subnet_id})
        except ClientError as exc:
            if exc.response.get("Error", {}).get("Code") not in retryable_codes:
                raise
            last_exc = exc
            continue
        chosen_subnet = subnet_id
        break

    if response is None:
        # Explicit check rather than `assert`, which is stripped under -O.
        if last_exc is None:
            raise RuntimeError("No candidate subnets were available for launch")
        raise last_exc

    instance = response["Instances"][0]
    machine = Machine(
        aws_instance_id=instance["InstanceId"],
        name=f"desineuron-{profile.name}",
        region=profile.region,
        profile_name=profile.name,
        instance_type=profile.instance_type,
        lifecycle=lifecycle,
        state=instance["State"]["Name"],
        public_ip=instance.get("PublicIpAddress"),
        private_ip=instance.get("PrivateIpAddress"),
        launch_time=instance.get("LaunchTime"),
        volume_gb=root_volume_gb,
        public_ipv4_attached=True,
        details={"launched_by": actor, "chosen_subnet_id": chosen_subnet},
    )
    db.add(machine)
    # Flush so machine.id exists before the session row references it.
    db.flush()
    session_row = RuntimeSession(machine_id=machine.id, actor=actor, workload_name=profile.name, status="active")
    db.add(session_row)
    db.add(
        AuditEvent(
            actor=actor,
            action="launch_machine",
            entity_type="machine",
            entity_id=machine.aws_instance_id,
            payload={"profile": profile.name, "lifecycle": lifecycle},
        )
    )
    # Flush again so callers can read session_row.id straight away.
    db.flush()
    return session_row
def stop_machine(db: Session, machine: Machine, actor: str) -> None:
    """Request an EC2 stop for ``machine`` and record who asked for it.

    The local state is set to ``"stopping"`` and a ``stop_machine`` audit
    event is added to ``db``; nothing is committed here.
    """
    instance_id = machine.aws_instance_id
    ec2_client(machine.region).stop_instances(InstanceIds=[instance_id])
    machine.state = "stopping"
    audit_entry = AuditEvent(
        actor=actor,
        action="stop_machine",
        entity_type="machine",
        entity_id=instance_id,
        payload={},
    )
    db.add(audit_entry)
def terminate_machine(db: Session, machine: Machine, actor: str) -> None:
    """Request EC2 termination of ``machine`` and record who asked for it.

    The local state is set to ``"shutting-down"`` and a ``terminate_machine``
    audit event is added to ``db``; nothing is committed here.
    """
    instance_id = machine.aws_instance_id
    ec2_client(machine.region).terminate_instances(InstanceIds=[instance_id])
    machine.state = "shutting-down"
    audit_entry = AuditEvent(
        actor=actor,
        action="terminate_machine",
        entity_type="machine",
        entity_id=instance_id,
        payload={},
    )
    db.add(audit_entry)
def ssh_run(host: str, user: str, command: str) -> subprocess.CompletedProcess[str]:
    """Run ``command`` on a remote host over SSH and capture its output.

    Authenticates with the key at ``settings.ssh_key_path``. A non-zero exit
    status does not raise; callers inspect ``returncode``.

    Args:
        host: Host name or IP address to connect to.
        user: Remote login user.
        command: Command string handed to the remote shell as-is. Callers
            must quote any untrusted parts themselves.

    Returns:
        The completed process with text ``stdout`` and ``stderr``.
    """
    import os

    # NOTE(security): host key checking is disabled, so the connection is
    # open to man-in-the-middle attacks. Known hosts go to the platform null
    # device; the literal "NUL" only means that on Windows and would create
    # a real file of that name on Linux.
    argv = [
        "ssh",
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        f"UserKnownHostsFile={os.devnull}",
        "-i",
        str(settings.ssh_key_path),
        f"{user}@{host}",
        command,
    ]
    return subprocess.run(argv, capture_output=True, text=True, check=False)
def hydrate_model(machine: Machine, model_prefix: str, actor: str, bucket_name: str) -> dict:
    """Copy a model from S3 onto a machine and check the files arrived.

    Installs ``s5cmd`` on the machine when missing, copies everything under
    the model prefix into ``/opt/dlami/nvme/models/<model name>`` and then
    tests that each file listed in the model's S3 manifest exists remotely.

    Args:
        machine: Target machine; must have a public IP.
        model_prefix: S3 key prefix of the model, e.g. ``"models/foo/"``. A
            missing trailing slash is tolerated.
        actor: Accepted for interface compatibility; not used here.
        bucket_name: Bucket holding the model and its manifest.

    Returns:
        ``stdout``/``stderr``/``returncode`` of the copy, the ``remote_dir``
        and a ``verify`` dict with the outcome of the existence checks
        (``stderr == "manifest_missing"`` when the manifest cannot be read).

    Raises:
        RuntimeError: If the machine has no public IP.
        ValueError: If ``model_prefix`` contains no model name.
    """
    if not machine.public_ip:
        raise RuntimeError("Machine has no public IP for hydration")

    # Derive the model name once so the remote directory and the manifest key
    # always agree, with or without a trailing slash on the prefix.
    trimmed_prefix = model_prefix.rstrip("/")
    model_name = trimmed_prefix.split("/")[-1]
    if not model_name:
        raise ValueError(f"Model prefix has no model name: {model_prefix!r}")
    # The trailing slash keeps the wildcard from matching sibling prefixes
    # such as "models/foo-v2/" when asked for "models/foo".
    s3_source = f"s3://{bucket_name}/{trimmed_prefix}/*"
    remote_dir = f"/opt/dlami/nvme/models/{model_name}"

    # Braces group the download with the move; without them the shell parses
    # "A || B | C && D" as "(A || B | C) && D" and runs the move even when
    # s5cmd is already installed.
    install_cmd = (
        "command -v s5cmd >/dev/null 2>&1 || "
        "{ curl -L https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz "
        "| tar -xz -C /tmp && sudo mv /tmp/s5cmd /usr/local/bin/s5cmd; }"
    )
    # Best effort: a failed install surfaces through the copy step below.
    ssh_run(machine.public_ip, settings.gpu_ssh_user, install_cmd)

    quoted_dir = shlex.quote(remote_dir)
    copy_cmd = (
        f"mkdir -p {quoted_dir} && "
        f"s5cmd cp {shlex.quote(s3_source)} {shlex.quote(remote_dir + '/')}"
    )
    result = ssh_run(machine.public_ip, settings.gpu_ssh_user, copy_cmd)

    manifest_key = f"manifests/models/{model_name}.json"
    try:
        manifest_obj = s3_client().get_object(Bucket=bucket_name, Key=manifest_key)
        manifest = json.loads(manifest_obj["Body"].read().decode("utf-8"))
        # One `test -f` per manifest entry; an empty manifest verifies trivially.
        checks = " && ".join(
            f"test -f {shlex.quote(remote_dir + '/' + entry['path'])}"
            for entry in manifest.get("files", [])
        ) or "true"
        verify = ssh_run(machine.public_ip, settings.gpu_ssh_user, checks)
        verify_result = {"stdout": verify.stdout, "stderr": verify.stderr, "returncode": verify.returncode}
    except ClientError:
        verify_result = {"stdout": "", "stderr": "manifest_missing", "returncode": 1}

    return {
        "stdout": result.stdout,
        "stderr": result.stderr,
        "returncode": result.returncode,
        "remote_dir": remote_dir,
        "verify": verify_result,
    }
def start_service(machine: Machine, service_name: str) -> dict:
    """Start a systemd unit on the machine and report whether it is active.

    Args:
        machine: Target machine; must have a public IP.
        service_name: Unit name. It is shell-quoted before being placed in
            the remote command, so it cannot inject extra commands.

    Returns:
        ``stdout``, ``stderr`` and ``returncode`` of the remote command.

    Raises:
        RuntimeError: If the machine has no public IP.
    """
    if not machine.public_ip:
        raise RuntimeError("Machine has no public IP")
    unit = shlex.quote(service_name)
    remote_cmd = f"sudo systemctl start {unit} && sudo systemctl is-active {unit}"
    result = ssh_run(machine.public_ip, settings.gpu_ssh_user, remote_cmd)
    return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}
def stop_service(machine: Machine, service_name: str) -> dict:
    """Stop a systemd unit on the machine.

    Args:
        machine: Target machine; must have a public IP.
        service_name: Unit name. It is shell-quoted before being placed in
            the remote command, so it cannot inject extra commands.

    Returns:
        ``stdout``, ``stderr`` and ``returncode`` of the remote command.

    Raises:
        RuntimeError: If the machine has no public IP.
    """
    if not machine.public_ip:
        raise RuntimeError("Machine has no public IP")
    remote_cmd = f"sudo systemctl stop {shlex.quote(service_name)}"
    result = ssh_run(machine.public_ip, settings.gpu_ssh_user, remote_cmd)
    return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}
def export_sessions_csv(db: Session, target_path: str) -> str:
    """Write all runtime sessions and their costs to a CSV file.

    Sessions are outer-joined to their cost rows, so a session without any
    cost record still appears, with empty cost columns.

    Args:
        db: Session used for the query.
        target_path: File to create or overwrite (UTF-8).

    Returns:
        ``target_path`` unchanged.
    """
    header = [
        "session_id",
        "actor",
        "workload",
        "status",
        "started_at",
        "ended_at",
        "runtime_hours",
        "compute_cost_usd",
        "storage_cost_usd",
        "public_ip_cost_usd",
        "total_cost_usd",
    ]
    query = select(
        RuntimeSession.id,
        RuntimeSession.actor,
        RuntimeSession.workload_name,
        RuntimeSession.status,
        RuntimeSession.started_at,
        RuntimeSession.ended_at,
        SessionCost.runtime_hours,
        SessionCost.compute_cost_usd,
        SessionCost.storage_cost_usd,
        SessionCost.public_ip_cost_usd,
        SessionCost.total_cost_usd,
    ).join(SessionCost, SessionCost.session_id == RuntimeSession.id, isouter=True)
    result_rows = db.execute(query)

    with open(target_path, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(result_rows)
    return target_path

View File

@@ -0,0 +1,79 @@
from __future__ import annotations
import json
from pathlib import Path
import typer
from sqlalchemy import select
from .aws_control import calculate_machine_cost, create_managed_instance, export_sessions_csv, latest_market_price, stop_machine, terminate_machine
from .database import Base, engine, session_scope
from .models import AuditEvent, Machine, MachineProfile, Session as RuntimeSession
# Typer application object; the @app.command functions below register on it.
app = typer.Typer(help="Desineuron Ops CLI")
@app.command("machine-list")
def machine_list():
    # Print one line per known machine, most recently updated first:
    # instance id, instance type, state and estimated total cost in USD.
    with session_scope() as db:
        machines = db.scalars(select(Machine).order_by(Machine.updated_at.desc())).all()
        for machine in machines:
            # Machines without a recorded lifecycle are priced as on-demand,
            # matching how session costs are calculated elsewhere.
            lifecycle = machine.lifecycle or "on-demand"
            hourly_rate = latest_market_price(db, machine.region, machine.instance_type, lifecycle)
            cost = calculate_machine_cost(machine, hourly_rate)
            typer.echo(f"{machine.aws_instance_id} {machine.instance_type} {machine.state} ${cost['total_cost_usd']:.4f}")
@app.command("machine-launch")
def machine_launch(profile_name: str, lifecycle: str = "spot", actor: str = "cli"):
    # Launch one instance from the named machine profile and print a JSON
    # summary containing the new session id, the profile and the lifecycle.
    #
    # Reject unknown lifecycles up front: anything other than "spot" would
    # launch an on-demand instance while storing the mistyped value on the
    # machine row, where no market price would ever match it.
    if lifecycle not in ("spot", "on-demand"):
        raise typer.BadParameter(f"Unknown lifecycle: {lifecycle} (expected 'spot' or 'on-demand')")
    with session_scope() as db:
        profile = db.scalar(select(MachineProfile).where(MachineProfile.name == profile_name))
        if not profile:
            raise typer.BadParameter(f"Unknown profile: {profile_name}")
        session_row = create_managed_instance(db, profile, actor, lifecycle)
        # Make sure the primary key is assigned before it is reported.
        db.flush()
        session_id = session_row.id
    # Printed only after the transaction has committed.
    typer.echo(json.dumps({"session_id": session_id, "profile": profile_name, "lifecycle": lifecycle}))
@app.command("machine-stop")
def machine_stop(machine_id: str, actor: str = "cli"):
    # Stop the machine with the given AWS instance id, mark its active
    # session (if any) as "stopped" and print a JSON confirmation.
    with session_scope() as db:
        by_instance_id = select(Machine).where(Machine.aws_instance_id == machine_id)
        machine = db.scalar(by_instance_id)
        if not machine:
            raise typer.BadParameter(f"Unknown machine: {machine_id}")
        stop_machine(db, machine, actor)

        active_query = select(RuntimeSession).where(
            RuntimeSession.machine_id == machine.id,
            RuntimeSession.status == "active",
        )
        running = db.scalar(active_query)
        if running:
            running.status = "stopped"
        confirmation = {"machine": machine_id, "status": "stopping"}
        typer.echo(json.dumps(confirmation))
@app.command("machine-terminate")
def machine_terminate(machine_id: str, actor: str = "cli"):
    # Terminate the machine with the given AWS instance id, mark its active
    # session (if any) as "terminated" and print a JSON confirmation.
    with session_scope() as db:
        by_instance_id = select(Machine).where(Machine.aws_instance_id == machine_id)
        machine = db.scalar(by_instance_id)
        if not machine:
            raise typer.BadParameter(f"Unknown machine: {machine_id}")
        terminate_machine(db, machine, actor)

        active_query = select(RuntimeSession).where(
            RuntimeSession.machine_id == machine.id,
            RuntimeSession.status == "active",
        )
        running = db.scalar(active_query)
        if running:
            running.status = "terminated"
        confirmation = {"machine": machine_id, "status": "terminating"}
        typer.echo(json.dumps(confirmation))
@app.command("audit-tail")
def audit_tail(limit: int = 20):
    # Print the newest audit events, one JSON object per line, newest first.
    with session_scope() as db:
        newest_first = select(AuditEvent).order_by(AuditEvent.created_at.desc()).limit(limit)
        for event in db.scalars(newest_first).all():
            record = {
                "actor": event.actor,
                "action": event.action,
                "entity": event.entity_id,
                "created_at": event.created_at.isoformat(),
            }
            typer.echo(json.dumps(record))
@app.command("export-sessions")
def export_sessions(output: Path = Path("/app/exports/sessions_cli.csv")):
    # Export all sessions with their costs to a CSV file and print its path.
    destination = str(output)
    with session_scope() as db:
        export_sessions_csv(db, destination)
        typer.echo(destination)
# Run the CLI only when this module is executed directly, not on import.
if "__main__" == __name__:
    app()

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
import os
import json
from dataclasses import field
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration read from ``OPS_*`` environment variables.

    Every default is evaluated once, when this class is defined, so the
    environment must be populated before the module is imported.
    ``OPS_DATABASE_URL``, ``OPS_SESSION_SECRET`` and ``OPS_ADMIN_PASSWORD``
    are mandatory; a missing one raises ``KeyError`` at import time.

    Fields holding credentials are declared with ``repr=False`` so that
    printing or logging the settings object does not expose them.
    """

    # --- Credentials and secrets (kept out of repr) ---
    database_url: str = field(default=os.environ["OPS_DATABASE_URL"], repr=False)
    session_secret: str = field(default=os.environ["OPS_SESSION_SECRET"], repr=False)
    admin_username: str = os.environ.get("OPS_ADMIN_USERNAME", "sagnik")
    admin_password: str = field(default=os.environ["OPS_ADMIN_PASSWORD"], repr=False)
    # Raw JSON text; it is not parsed in this class.
    team_users_json: str = os.environ.get("OPS_TEAM_USERS_JSON", "[]")

    # --- Regions and bucket ---
    default_region: str = os.environ.get("OPS_DEFAULT_REGION", "us-east-1")
    # Comma-separated list; blank entries are dropped.
    visible_regions: tuple[str, ...] = tuple(
        region.strip() for region in os.environ.get("OPS_VISIBLE_REGIONS", "us-east-1").split(",") if region.strip()
    )
    bucket_name: str = os.environ.get("OPS_BUCKET_NAME", "")
    bucket_region: str = os.environ.get("OPS_BUCKET_REGION", "us-east-1")

    # --- SSH access ---
    ssh_key_path: Path = Path(os.environ.get("OPS_SSH_KEY_PATH", "/app/state/desineuron-l4-node.pem"))
    gpu_ssh_user: str = os.environ.get("OPS_GPU_SSH_USER", "ubuntu")
    ingress_ssh_host: str = os.environ.get("OPS_INGRESS_SSH_HOST", "")
    ingress_ssh_user: str = os.environ.get("OPS_INGRESS_SSH_USER", "ec2-user")
    ingress_ssh_port: int = int(os.environ.get("OPS_INGRESS_SSH_PORT", "22"))
    ingress_route_helper: str = os.environ.get("OPS_INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py")
    public_base_url: str = os.environ.get("OPS_LINUX_PUBLIC_BASE_URL", "https://ops.desineuron.in")

    # --- Cost model inputs (USD) ---
    ebs_gp3_per_gb_month: float = float(os.environ.get("OPS_PRICE_EBS_GP3_PER_GB_MONTH", "0.08"))
    public_ipv4_per_hour: float = float(os.environ.get("OPS_PRICE_PUBLIC_IPV4_PER_HOUR", "0.005"))

    # --- Machine allow-list and GPU launch defaults ---
    # Comma-separated list; blank entries are dropped.
    allowed_machine_ids: tuple[str, ...] = tuple(
        machine.strip() for machine in os.environ.get("OPS_ALLOWED_MACHINE_IDS", "").split(",") if machine.strip()
    )
    gpu_subnet_id: str = os.environ.get("OPS_GPU_SUBNET_ID", "")
    # Comma-separated list; blank entries are dropped.
    gpu_security_group_ids: tuple[str, ...] = tuple(
        group.strip() for group in os.environ.get("OPS_GPU_SECURITY_GROUP_IDS", "").split(",") if group.strip()
    )
    gpu_key_name: str = os.environ.get("OPS_GPU_KEY_NAME", "")
    gpu_ami_id: str = os.environ.get("OPS_GPU_AMI_ID", "")
    gpu_instance_profile: str = os.environ.get("OPS_GPU_INSTANCE_PROFILE", "")
    gpu_root_volume_gb: int = int(os.environ.get("OPS_GPU_ROOT_VOLUME_GB", "300"))

    # --- Local directories ---
    export_dir: Path = Path(os.environ.get("OPS_CSV_EXPORT_DIR", "/app/exports"))
    log_dir: Path = Path(os.environ.get("OPS_LOG_DIR", "/app/logs"))
    state_dir: Path = Path(os.environ.get("OPS_STATE_DIR", "/app/state"))
    model_library_root: Path = Path(os.environ.get("OPS_MODEL_LIBRARY_ROOT", "/model-library"))

    # --- Cloudflare ---
    cloudflare_zone_name: str = os.environ.get("OPS_CLOUDFLARE_ZONE_NAME", "desineuron.in")
    cloudflare_api_token: str = field(default=os.environ.get("OPS_CLOUDFLARE_API_TOKEN", ""), repr=False)
# Module-level settings instance imported by the rest of the package.
settings = Settings()

View File

@@ -0,0 +1,41 @@
from __future__ import annotations
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
from .config import settings
# Shared SQLAlchemy engine for settings.database_url, created with
# pool_pre_ping enabled.
engine = create_engine(settings.database_url, pool_pre_ping=True)
# Session factory bound to the engine: no autoflush, explicit commits, and
# objects are not expired on commit.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False, expire_on_commit=False)
class Base(DeclarativeBase):
    """Declarative base class for the package's ORM models."""
def get_db():
    """Yield a database session for one unit of work.

    Commits once the consumer resumes the generator without error, rolls
    back and re-raises on any ``Exception``, and always closes the session.
    """
    db_session = SessionLocal()
    try:
        yield db_session
        db_session.commit()
    except Exception:
        # Undo partial work, then let the original error propagate.
        db_session.rollback()
        raise
    finally:
        db_session.close()
@contextmanager
def session_scope():
    """Context manager wrapping a block in a single transaction.

    Commits when the block exits normally, rolls back and re-raises on any
    ``Exception``, and always closes the session.
    """
    scoped = SessionLocal()
    try:
        yield scoped
        scoped.commit()
    except Exception:
        # Undo partial work, then let the original error propagate.
        scoped.rollback()
        raise
    finally:
        scoped.close()

View File

@@ -0,0 +1,598 @@
from __future__ import annotations
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
from botocore.exceptions import ClientError
from fastapi import Depends, FastAPI, Form, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from sqlalchemy import func, select
from sqlalchemy.orm import Session
from starlette.middleware.sessions import SessionMiddleware
from .aws_control import calculate_machine_cost, create_managed_instance, ensure_bucket, export_sessions_csv, hydrate_model, latest_market_price, seed_bucket_prefixes, start_service, stop_machine, stop_service, sync_instances, terminate_machine, upload_model_directory
from .config import settings
from .database import Base, engine, get_db, session_scope
from .models import AuditEvent, CsvExport, Job, Machine, MachineProfile, MarketSnapshot, ModelCatalog, RouteBinding, Session as RuntimeSession, SessionCost, User, WorkloadProfile
from .route_control import apply_route, remove_route
from .seed import seed_defaults
from .security import get_current_user, verify_password
# FastAPI application object; the route handlers below register on it.
app = FastAPI(title="Desineuron Ops Control Plane")
# Session middleware keyed by settings.session_secret; handlers read and
# write request.session.
app.add_middleware(SessionMiddleware, secret_key=settings.session_secret)
# Templates and static assets live next to this module.
template_dir = Path(__file__).parent / "templates"
static_dir = Path(__file__).parent / "static"
templates = Jinja2Templates(directory=str(template_dir))
# Static files are served under the /static URL prefix.
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
def utcnow() -> datetime:
    """Return the current instant as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
def recent_totals(db: Session) -> dict:
    """Sum session costs calculated within the last day and last 30 days.

    Args:
        db: Session used for the two aggregate queries.

    Returns:
        ``{"last_24h_usd": ..., "last_30d_usd": ...}``, each rounded to four
        decimals and ``0.0`` when no cost rows fall in the window.
    """
    now = utcnow()

    def total_since(cutoff: datetime) -> float:
        # Total cost of all cost rows calculated at or after `cutoff`.
        stmt = (
            select(func.coalesce(func.sum(SessionCost.total_cost_usd), 0.0))
            .join(RuntimeSession, RuntimeSession.id == SessionCost.session_id)
            .where(SessionCost.calculated_at >= cutoff)
        )
        return round(float(db.scalar(stmt) or 0.0), 4)

    last_day = total_since(now - timedelta(days=1))
    last_month = total_since(now - timedelta(days=30))
    return {"last_24h_usd": last_day, "last_30d_usd": last_month}
def pop_flash(request: Request) -> dict | None:
    """Remove and return the pending flash message, or ``None`` if absent."""
    flash = request.session.pop("flash", None)
    return flash
def set_flash(request: Request, level: str, message: str) -> None:
    """Store a one-off flash message in the session, replacing any earlier one.

    Args:
        request: Request whose session receives the message.
        level: Severity label stored under ``"level"``.
        message: Text stored under ``"message"``.
    """
    flash = {"level": level, "message": message}
    request.session["flash"] = flash
def parse_tag_list(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed tags, dropping blank entries."""
    return [tag for tag in map(str.strip, raw.split(",")) if tag]
@app.on_event("startup")
def startup() -> None:
    """Prepare the database, working directories, seed data and bucket at application start."""
    Base.metadata.create_all(bind=engine)
    for directory in (settings.export_dir, settings.log_dir, settings.state_dir):
        directory.mkdir(parents=True, exist_ok=True)
    with session_scope() as db:
        seed_defaults(db)
    bucket = settings.bucket_name
    if not bucket:
        # No bucket configured: skip the bucket setup steps.
        return
    ensure_bucket(bucket, settings.bucket_region)
    seed_bucket_prefixes(bucket)
@app.get("/", response_class=HTMLResponse)
def root(request: Request):
    """Redirect to the dashboard when a username is in the session, otherwise to the login page."""
    destination = "/dashboard" if request.session.get("username") else "/login"
    return RedirectResponse(destination, status_code=302)
@app.get("/login", response_class=HTMLResponse)
def login_page(request: Request):
    """Render the login form with no error message."""
    context = {"request": request, "error": None}
    return templates.TemplateResponse("login.html", context)
@app.post("/login", response_class=HTMLResponse)
def login(request: Request, username: str = Form(...), password: str = Form(...), db: Session = Depends(get_db)):
    """Check submitted credentials and start a session on success.

    Only active users are considered. On failure the login form is rendered
    again with HTTP 401 and a generic error message.
    """
    lookup = select(User).where(User.username == username, User.is_active.is_(True))
    user = db.scalar(lookup)
    if user and verify_password(password, user.password_hash):
        request.session["username"] = user.username
        return RedirectResponse("/dashboard", status_code=302)
    context = {"request": request, "error": "Invalid credentials"}
    return templates.TemplateResponse("login.html", context, status_code=401)
@app.get("/logout")
def logout(request: Request):
    """Clear everything stored in the session and redirect to the login page."""
    session_data = request.session
    session_data.clear()
    return RedirectResponse("/login", status_code=302)
@app.get("/dashboard", response_class=HTMLResponse)
def dashboard(request: Request, current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Render the main dashboard page.

    Loads the machine, profile, workload, model, route, job, session, market
    and audit rows shown on ``index.html``, estimates the cost of every
    machine, and builds the summary figures for the page. Any pending flash
    message is consumed while rendering.
    """
    machines = db.scalars(select(Machine).order_by(Machine.updated_at.desc())).all()
    profiles = db.scalars(select(MachineProfile).order_by(MachineProfile.name)).all()
    workloads = db.scalars(select(WorkloadProfile).order_by(WorkloadProfile.name)).all()
    models = db.scalars(select(ModelCatalog).order_by(ModelCatalog.model_key)).all()
    routes = db.scalars(select(RouteBinding).order_by(RouteBinding.hostname)).all()
    # The lists below are capped for display only.
    jobs = db.scalars(select(Job).order_by(Job.created_at.desc()).limit(20)).all()
    sessions = db.scalars(select(RuntimeSession).order_by(RuntimeSession.started_at.desc()).limit(20)).all()
    market_rows = db.scalars(select(MarketSnapshot).order_by(MarketSnapshot.observed_at.desc()).limit(100)).all()
    audits = db.scalars(select(AuditEvent).order_by(AuditEvent.created_at.desc()).limit(20)).all()

    costs = {}
    total_hourly = 0.0
    total_estimated = 0.0
    for machine in machines:
        hourly_rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle)
        machine_cost = calculate_machine_cost(machine, hourly_rate)
        total_hourly += machine_cost["hourly_price_usd"]
        total_estimated += machine_cost["total_cost_usd"]
        costs[machine.aws_instance_id] = machine_cost

    # Count active work over the whole table. `sessions` and `jobs` hold only
    # the 20 newest rows, so counting those lists would under-report whenever
    # an older row is still active.
    active_sessions = db.scalar(
        select(func.count()).select_from(RuntimeSession).where(RuntimeSession.status == "active")
    )
    active_jobs = db.scalar(
        select(func.count()).select_from(Job).where(Job.status.in_(("queued", "running")))
    )

    summary = {
        "machine_count": len(machines),
        "active_sessions": int(active_sessions or 0),
        "active_jobs": int(active_jobs or 0),
        "routes_active": sum(1 for route in routes if route.status == "active"),
        "hourly_burn_usd": round(total_hourly, 4),
        "fleet_estimated_cost_usd": round(total_estimated, 4),
        **recent_totals(db),
    }
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "user": current_user,
            "machines": machines,
            "profiles": profiles,
            "workloads": workloads,
            "models": models,
            "routes": routes,
            "jobs": jobs,
            "sessions": sessions,
            "market_rows": market_rows,
            "audits": audits,
            "costs": costs,
            "summary": summary,
            "flash": pop_flash(request),
            "bucket_name": settings.bucket_name,
            "regions": settings.visible_regions,
        },
    )
@app.get("/api/markets/instances")
def get_markets(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Return the latest on-demand and spot pricing per machine profile and visible region.

    Returns:
        One dict per machine profile (ordered by name) with the profile's
        hardware fields and a ``regions`` mapping. Each region entry holds the
        most recent on-demand and spot price, their availability flags, and
        the newer of the two observation timestamps (``None`` when neither
        snapshot exists).
    """

    def _latest_snapshot(region: str, instance_type: str, lifecycle: str):
        # LIMIT 1 so only the newest row is fetched; without it every
        # historical snapshot for the key is returned just to use the first.
        return db.scalar(
            select(MarketSnapshot)
            .where(
                MarketSnapshot.region == region,
                MarketSnapshot.instance_type == instance_type,
                MarketSnapshot.lifecycle == lifecycle,
            )
            .order_by(MarketSnapshot.observed_at.desc())
            .limit(1)
        )

    profiles = db.scalars(select(MachineProfile).order_by(MachineProfile.name)).all()
    payload = []
    for profile in profiles:
        per_region = {}
        for region in settings.visible_regions:
            on_demand = _latest_snapshot(region, profile.instance_type, "on-demand")
            spot = _latest_snapshot(region, profile.instance_type, "spot")
            stamps = [snap.observed_at for snap in (on_demand, spot) if snap and snap.observed_at]
            per_region[region] = {
                "on_demand": on_demand.hourly_price_usd if on_demand else None,
                "on_demand_available": bool(on_demand and on_demand.offering_available),
                "spot": spot.hourly_price_usd if spot else None,
                "spot_available": bool(spot and spot.offering_available),
                "last_seen": max(stamps, default=None),
            }
        payload.append(
            {
                "profile": profile.name,
                "instance_type": profile.instance_type,
                "gpu_label": profile.gpu_label,
                "vcpu": profile.vcpu,
                "memory_gib": profile.memory_gib,
                "regions": per_region,
            }
        )
    return payload
@app.get("/api/machines")
def get_machines(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """List all machines, most recently updated first, each with its estimated cost."""

    def _serialize(machine: Machine) -> dict:
        # Look up the hourly rate first; the cost estimate depends on it.
        hourly_rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle)
        return {
            "id": machine.id,
            "aws_instance_id": machine.aws_instance_id,
            "name": machine.name,
            "region": machine.region,
            "state": machine.state,
            "instance_type": machine.instance_type,
            "lifecycle": machine.lifecycle,
            "public_ip": machine.public_ip,
            "private_ip": machine.private_ip,
            "cost": calculate_machine_cost(machine, hourly_rate),
        }

    newest_first = select(Machine).order_by(Machine.updated_at.desc())
    return [_serialize(machine) for machine in db.scalars(newest_first).all()]
@app.post("/api/machines/launch")
def launch_machine(request: Request, profile_name: str = Form(...), lifecycle: str = Form(...), db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Launch a managed instance from a machine profile and record the attempt as a job.

    Raises:
        HTTPException: 404 when no profile has the given name.

    Returns:
        A redirect to the dashboard in both the success and failure case; the
        outcome is reported through a flash message.
    """
    profile = db.scalar(select(MachineProfile).where(MachineProfile.name == profile_name))
    if profile is None:
        raise HTTPException(status_code=404, detail="Profile not found")

    actor = current_user.username
    job = Job(
        job_type="launch_machine",
        status="running",
        actor=actor,
        payload={"profile_name": profile_name, "lifecycle": lifecycle},
        started_at=utcnow(),
    )
    db.add(job)
    db.flush()

    try:
        session_row = create_managed_instance(db, profile, actor, lifecycle)
    except Exception as exc:
        # Any launch failure is recorded on the job and in the audit trail
        # rather than propagated. AWS client errors carry their own code.
        if isinstance(exc, ClientError):
            error_code = exc.response.get("Error", {}).get("Code")
        else:
            error_code = exc.__class__.__name__
        job.status = "failed"
        job.finished_at = utcnow()
        job.result = {"error": str(exc), "code": error_code}
        db.add(AuditEvent(actor=actor, action="launch_machine_failed", entity_type="profile", entity_id=profile.name, payload=job.result))
        set_flash(request, "error", f"Launch failed for {profile.name}: {error_code}")
    else:
        job.status = "completed"
        job.session_id = session_row.id
        job.finished_at = utcnow()
        job.result = {"session_id": session_row.id}
        set_flash(request, "success", f"Launched {profile.name} as {lifecycle}.")
    return RedirectResponse("/dashboard", status_code=302)
@app.post("/api/machines/{machine_id}/stop")
def api_stop_machine(machine_id: int, request: Request, db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Request a stop for a machine and close its active session, if any.

    Raises:
        HTTPException: 404 when the machine does not exist.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise
        ``{"status": "stopping"}``.
    """
    machine = db.get(Machine, machine_id)
    if machine is None:
        raise HTTPException(status_code=404, detail="Machine not found")

    actor = current_user.username
    job = Job(
        job_type="stop_machine",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"aws_instance_id": machine.aws_instance_id},
        started_at=utcnow(),
    )
    db.add(job)
    stop_machine(db, machine, actor)

    open_session_query = select(RuntimeSession).where(
        RuntimeSession.machine_id == machine.id,
        RuntimeSession.status == "active",
    )
    active_session = db.scalar(open_session_query)
    if active_session:
        active_session.status = "stopped"
        active_session.ended_at = utcnow()

    outcome = {"status": "stopping"}
    job.status = "completed"
    job.finished_at = utcnow()
    job.result = outcome

    if "text/html" not in request.headers.get("accept", ""):
        return dict(outcome)
    set_flash(request, "success", f"Stop requested for {machine.aws_instance_id}.")
    return RedirectResponse("/dashboard", status_code=302)
@app.post("/api/machines/{machine_id}/terminate")
def api_terminate_machine(machine_id: int, request: Request, db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Request termination of a machine and mark its active session, if any, as terminated.

    Raises:
        HTTPException: 404 when the machine does not exist.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise
        ``{"status": "terminating"}``.
    """
    machine = db.get(Machine, machine_id)
    if machine is None:
        raise HTTPException(status_code=404, detail="Machine not found")

    actor = current_user.username
    job = Job(
        job_type="terminate_machine",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"aws_instance_id": machine.aws_instance_id},
        started_at=utcnow(),
    )
    db.add(job)
    terminate_machine(db, machine, actor)

    open_session_query = select(RuntimeSession).where(
        RuntimeSession.machine_id == machine.id,
        RuntimeSession.status == "active",
    )
    active_session = db.scalar(open_session_query)
    if active_session:
        active_session.status = "terminated"
        active_session.ended_at = utcnow()

    outcome = {"status": "terminating"}
    job.status = "completed"
    job.finished_at = utcnow()
    job.result = outcome

    if "text/html" not in request.headers.get("accept", ""):
        return dict(outcome)
    set_flash(request, "success", f"Terminate requested for {machine.aws_instance_id}.")
    return RedirectResponse("/dashboard", status_code=302)
@app.post("/api/models/hydrate")
def api_hydrate_model(request: Request, machine_id: int = Form(...), model_key: str = Form(...), db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Hydrate a catalogued model onto a machine from the configured bucket.

    Raises:
        HTTPException: 404 when the machine or model is missing, 400 when no
            bucket is configured.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise the
        hydration result as JSON.
    """
    machine = db.get(Machine, machine_id)
    model = db.scalar(select(ModelCatalog).where(ModelCatalog.model_key == model_key))
    if not (machine and model):
        raise HTTPException(status_code=404, detail="Machine or model not found")
    if not settings.bucket_name:
        raise HTTPException(status_code=400, detail="Bucket is not configured")

    actor = current_user.username
    job = Job(
        job_type="hydrate_model",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"model_key": model_key},
        started_at=utcnow(),
    )
    db.add(job)

    result = hydrate_model(machine, model.s3_prefix, actor, settings.bucket_name)
    db.add(
        AuditEvent(
            actor=actor,
            action="hydrate_model",
            entity_type="machine",
            entity_id=machine.aws_instance_id,
            payload={"model_key": model.model_key, "result": result},
        )
    )

    # A zero return code from the hydration command counts as success.
    succeeded = result.get("returncode") == 0
    job.status = "completed" if succeeded else "failed"
    job.finished_at = utcnow()
    job.result = result

    if "text/html" not in request.headers.get("accept", ""):
        return JSONResponse(result)
    level = "success" if succeeded else "error"
    outcome = "completed" if succeeded else "failed"
    set_flash(request, level, f"Hydration {outcome} for {model.label} on {machine.aws_instance_id}.")
    return RedirectResponse("/dashboard", status_code=302)
@app.post("/api/models/register")
def api_register_model(
    request: Request,
    model_key: str = Form(...),
    label: str = Form(...),
    source_relative_path: str = Form(...),
    workload_tags: str = Form(""),
    compatibility_tags: str = Form(""),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Upload a model directory to the bucket and create or update its catalog entry.

    The tag fields are comma-separated strings. The attempt is recorded as a
    job and an audit event whether it succeeds or fails.

    Raises:
        HTTPException: 400 when no bucket is configured; 500 when the upload
            fails and the caller does not accept HTML.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise the job
        result (manifest key and file count) as JSON.
    """
    if not settings.bucket_name:
        raise HTTPException(status_code=400, detail="Bucket is not configured")

    actor = current_user.username
    wants_html = "text/html" in request.headers.get("accept", "")
    job = Job(
        job_type="register_model",
        status="running",
        actor=actor,
        payload={
            "model_key": model_key,
            "label": label,
            "source_relative_path": source_relative_path,
            "workload_tags": workload_tags,
            "compatibility_tags": compatibility_tags,
        },
        started_at=utcnow(),
    )
    db.add(job)

    try:
        result = upload_model_directory(
            settings.bucket_name,
            model_key=model_key,
            source_relative_path=source_relative_path,
            label=label,
            workload_tags=parse_tag_list(workload_tags),
            compatibility_tags=parse_tag_list(compatibility_tags),
        )
    except Exception as exc:
        failure = {"error": str(exc)}
        job.status = "failed"
        job.finished_at = utcnow()
        job.result = failure
        db.add(AuditEvent(actor=actor, action="register_model_failed", entity_type="model", entity_id=model_key, payload=failure))
        if not wants_html:
            raise HTTPException(status_code=500, detail=str(exc))
        set_flash(request, "error", f"Model ingest failed for {model_key}: {exc}")
        return RedirectResponse("/dashboard", status_code=302)

    existing = db.scalar(select(ModelCatalog).where(ModelCatalog.model_key == model_key))
    manifest = result["manifest"]
    # The same field values apply whether the entry is updated or created.
    catalog_fields = {
        "label": label,
        "s3_prefix": result["s3_prefix"],
        "expected_manifest": manifest,
        "checksums": {entry["path"]: entry["sha256"] for entry in manifest["files"]},
        "compatibility_tags": result["compatibility_tags"],
        "workload_tags": result["workload_tags"],
        "size_gb": round(manifest["total_size_bytes"] / (1024 ** 3), 3),
    }
    if existing:
        for attribute, value in catalog_fields.items():
            setattr(existing, attribute, value)
    else:
        db.add(ModelCatalog(model_key=model_key, **catalog_fields))

    summary = {"manifest_key": result["manifest_key"], "file_count": manifest["file_count"]}
    job.status = "completed"
    job.finished_at = utcnow()
    job.result = summary
    db.add(AuditEvent(actor=actor, action="register_model", entity_type="model", entity_id=model_key, payload=summary))

    if not wants_html:
        return JSONResponse(summary)
    set_flash(request, "success", f"Model {model_key} uploaded to S3 and manifest stored.")
    return RedirectResponse("/dashboard", status_code=302)
@app.post("/api/workloads/start")
def api_start_workload(request: Request, machine_id: int = Form(...), workload_name: str = Form(...), auto_route: bool = Form(False), db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Start a workload's service on a machine and optionally map its ingress route.

    The route is attempted only when the service started successfully,
    ``auto_route`` is set, and the workload has a hostname and port and the
    machine has a private IP. The route binding is stored as active only when
    the ingress command itself succeeded.

    Raises:
        HTTPException: 404 when the machine or workload does not exist.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise JSON with
        the ``service`` result and the ``route`` result (``None`` if no route
        was attempted).
    """
    machine = db.get(Machine, machine_id)
    workload = db.scalar(select(WorkloadProfile).where(WorkloadProfile.name == workload_name))
    if not machine or not workload:
        raise HTTPException(status_code=404, detail="Machine or workload not found")
    job = Job(job_type="start_workload", status="running", actor=current_user.username, machine_id=machine_id, payload={"workload_name": workload_name, "auto_route": auto_route}, started_at=utcnow())
    db.add(job)

    result = start_service(machine, workload.name)
    service_ok = result.get("returncode") == 0
    route_result = None
    if service_ok and auto_route and workload.route_hostname and workload.default_port and machine.private_ip:
        route_result = apply_route(workload.route_hostname, "http", machine.private_ip, workload.default_port)
        # Record the binding only if the ingress accepted it. Otherwise the
        # database would show an active route that the ingress does not serve.
        if route_result.get("returncode") == 0:
            binding_details = {"managed_by": "ops_control_plane", "machine_id": machine.aws_instance_id}
            existing = db.scalar(select(RouteBinding).where(RouteBinding.hostname == workload.route_hostname))
            if existing:
                existing.scheme = "http"
                existing.target_host = machine.private_ip
                existing.target_port = workload.default_port
                existing.status = "active"
                existing.details = binding_details
            else:
                db.add(RouteBinding(hostname=workload.route_hostname, target_type="managed", target_host=machine.private_ip, target_port=workload.default_port, scheme="http", status="active", details=binding_details))

    db.add(AuditEvent(actor=current_user.username, action="start_workload", entity_type="machine", entity_id=machine.aws_instance_id, payload={"workload": workload.name, "result": result}))
    job.status = "completed" if service_ok else "failed"
    job.finished_at = utcnow()
    job.result = {"service": result, "route": route_result}
    if "text/html" in request.headers.get("accept", ""):
        set_flash(request, "success" if service_ok else "error", f"Start workload {'completed' if service_ok else 'failed'} for {workload.name} on {machine.aws_instance_id}.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse({"service": result, "route": route_result})
@app.post("/api/workloads/{machine_id}/stop")
def api_stop_workload(machine_id: int, request: Request, workload_name: str = Form(...), db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Stop a named workload service on a machine and record the outcome.

    Raises:
        HTTPException: 404 when the machine does not exist.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise the
        service result as JSON.
    """
    machine = db.get(Machine, machine_id)
    if machine is None:
        raise HTTPException(status_code=404, detail="Machine not found")

    actor = current_user.username
    job = Job(
        job_type="stop_workload",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"workload_name": workload_name},
        started_at=utcnow(),
    )
    db.add(job)

    result = stop_service(machine, workload_name)
    db.add(
        AuditEvent(
            actor=actor,
            action="stop_workload",
            entity_type="machine",
            entity_id=machine.aws_instance_id,
            payload={"workload": workload_name, "result": result},
        )
    )

    succeeded = result.get("returncode") == 0
    job.status = "completed" if succeeded else "failed"
    job.finished_at = utcnow()
    job.result = result

    if "text/html" not in request.headers.get("accept", ""):
        return JSONResponse(result)
    level = "success" if succeeded else "error"
    outcome = "completed" if succeeded else "failed"
    set_flash(request, level, f"Stop workload {outcome} for {workload_name} on {machine.aws_instance_id}.")
    return RedirectResponse("/dashboard", status_code=302)
@app.post("/api/routes/map")
def api_map_route(request: Request, hostname: str = Form(...), scheme: str = Form(...), target_host: str = Form(...), target_port: int = Form(...), db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Map a hostname to a target on the ingress and record the binding.

    The binding is created or updated as active only when the ingress command
    returned zero. A failed attempt is still recorded as a failed job and an
    audit event, but the stored binding is left as it was.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise the
        command result as JSON.
    """
    job = Job(job_type="map_route", status="running", actor=current_user.username, payload={"hostname": hostname, "scheme": scheme, "target_host": target_host, "target_port": target_port}, started_at=utcnow())
    db.add(job)
    result = apply_route(hostname, scheme, target_host, target_port)
    succeeded = result.get("returncode") == 0
    if succeeded:
        # Persist the binding only after the ingress accepted it, so the
        # database never shows an active route that was not applied.
        existing = db.scalar(select(RouteBinding).where(RouteBinding.hostname == hostname))
        if existing:
            existing.scheme = scheme
            existing.target_host = target_host
            existing.target_port = target_port
            existing.status = "active"
        else:
            db.add(RouteBinding(hostname=hostname, target_type="managed", target_host=target_host, target_port=target_port, scheme=scheme, status="active"))
    db.add(AuditEvent(actor=current_user.username, action="map_route", entity_type="route", entity_id=hostname, payload=result))
    job.status = "completed" if succeeded else "failed"
    job.finished_at = utcnow()
    job.result = result
    if "text/html" in request.headers.get("accept", ""):
        set_flash(request, "success" if succeeded else "error", f"Route {'mapped' if succeeded else 'map failed'} for {hostname}.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse(result)
@app.post("/api/routes/unmap")
def api_unmap_route(request: Request, hostname: str = Form(...), db: Session = Depends(get_db), current_user: User = Depends(get_current_user)):
    """Remove a hostname's route from the ingress and mark its binding as removed.

    The stored binding is marked removed only when the ingress command
    returned zero. A failed attempt is still recorded as a failed job and an
    audit event, but the binding keeps its previous status.

    Returns:
        A dashboard redirect for callers that accept HTML, otherwise the
        command result as JSON.
    """
    job = Job(job_type="unmap_route", status="running", actor=current_user.username, payload={"hostname": hostname}, started_at=utcnow())
    db.add(job)
    result = remove_route(hostname)
    succeeded = result.get("returncode") == 0
    if succeeded:
        # Only mark the binding removed once the ingress confirmed it;
        # otherwise a route that is still served would look removed.
        existing = db.scalar(select(RouteBinding).where(RouteBinding.hostname == hostname))
        if existing:
            existing.status = "removed"
    db.add(AuditEvent(actor=current_user.username, action="unmap_route", entity_type="route", entity_id=hostname, payload=result))
    job.status = "completed" if succeeded else "failed"
    job.finished_at = utcnow()
    job.result = result
    if "text/html" in request.headers.get("accept", ""):
        set_flash(request, "success" if succeeded else "error", f"Route {'removed' if succeeded else 'removal failed'} for {hostname}.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse(result)
@app.get("/api/markets/pricing")
def get_market_pricing(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Return the 200 most recent market snapshots, newest first."""
    newest_first = select(MarketSnapshot).order_by(MarketSnapshot.observed_at.desc()).limit(200)
    payload = []
    for snapshot in db.scalars(newest_first).all():
        payload.append(
            {
                "region": snapshot.region,
                "instance_type": snapshot.instance_type,
                "lifecycle": snapshot.lifecycle,
                "offering_available": snapshot.offering_available,
                "hourly_price_usd": snapshot.hourly_price_usd,
                "observed_at": snapshot.observed_at,
            }
        )
    return payload
@app.get("/api/sessions")
def get_sessions(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Return the 200 most recently started sessions with their latest cost record.

    Returns:
        One dict per session. ``machine`` is the AWS instance id, or ``None``
        when the session has no machine. ``cost`` and ``runtime_hours`` come
        from the most recently calculated cost row, or are ``None`` when the
        session has no cost rows.
    """
    sessions = db.scalars(select(RuntimeSession).order_by(RuntimeSession.started_at.desc()).limit(200)).all()
    payload = []
    for session_row in sessions:
        machine = db.get(Machine, session_row.machine_id) if session_row.machine_id else None
        # LIMIT 1 so only the newest cost row is fetched, not the session's
        # whole cost history.
        latest_cost = db.scalar(
            select(SessionCost)
            .where(SessionCost.session_id == session_row.id)
            .order_by(SessionCost.calculated_at.desc())
            .limit(1)
        )
        payload.append(
            {
                "id": session_row.id,
                "actor": session_row.actor,
                "workload_name": session_row.workload_name,
                "status": session_row.status,
                "started_at": session_row.started_at,
                "ended_at": session_row.ended_at,
                "notes": session_row.notes,
                "machine": machine.aws_instance_id if machine else None,
                "cost": latest_cost.total_cost_usd if latest_cost else None,
                "runtime_hours": latest_cost.runtime_hours if latest_cost else None,
            }
        )
    return payload
@app.get("/api/costs")
def api_costs(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Return each machine's estimated cost, the fleet total, and the recent recorded totals."""
    running_total = 0.0
    breakdown = []
    for machine in db.scalars(select(Machine)).all():
        rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle)
        estimate = calculate_machine_cost(machine, rate)
        running_total += estimate["total_cost_usd"]
        breakdown.append({"machine": machine.aws_instance_id, **estimate})
    response = {"machines": breakdown, "total_estimated_cost_usd": round(running_total, 4)}
    response.update(recent_totals(db))
    return response
@app.get("/api/models")
def api_models(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """List catalogued models ordered by model key.

    ``file_count`` is read from the stored manifest and defaults to 0 when
    the manifest is empty or lacks the field.
    """
    payload = []
    for entry in db.scalars(select(ModelCatalog).order_by(ModelCatalog.model_key)).all():
        manifest = entry.expected_manifest or {}
        payload.append(
            {
                "model_key": entry.model_key,
                "label": entry.label,
                "s3_prefix": entry.s3_prefix,
                "size_gb": entry.size_gb,
                "workload_tags": entry.workload_tags,
                "compatibility_tags": entry.compatibility_tags,
                "file_count": manifest.get("file_count", 0),
            }
        )
    return payload
@app.get("/api/audit")
def api_audit(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Return the 100 most recent audit events, newest first."""
    newest_first = select(AuditEvent).order_by(AuditEvent.created_at.desc()).limit(100)
    payload = []
    for entry in db.scalars(newest_first).all():
        payload.append(
            {
                "actor": entry.actor,
                "action": entry.action,
                "entity_type": entry.entity_type,
                "entity_id": entry.entity_id,
                "payload": entry.payload,
                "created_at": entry.created_at,
            }
        )
    return payload
@app.get("/api/jobs")
def api_jobs(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Return the 200 most recently created jobs, newest first."""
    newest_first = select(Job).order_by(Job.created_at.desc()).limit(200)
    payload = []
    for entry in db.scalars(newest_first).all():
        payload.append(
            {
                "id": entry.id,
                "job_type": entry.job_type,
                "status": entry.status,
                "actor": entry.actor,
                "machine_id": entry.machine_id,
                "session_id": entry.session_id,
                "payload": entry.payload,
                "result": entry.result,
                "created_at": entry.created_at,
                "finished_at": entry.finished_at,
            }
        )
    return payload
@app.get("/api/exports/csv")
def api_export_csv(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    """Write the sessions CSV export to the export directory and record it.

    The export always uses the file name ``sessions_latest.csv``.

    Returns:
        A dict with the path of the written file.
    """
    export_path = str(settings.export_dir / "sessions_latest.csv")
    export_sessions_csv(db, export_path)
    record = CsvExport(
        actor=current_user.username,
        export_type="sessions",
        path=export_path,
        details={"format": "csv"},
    )
    db.add(record)
    return {"path": export_path}
# Script entry point: serve the app with uvicorn on all interfaces, port 8080,
# with auto-reload disabled.
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the module is run directly.
    import uvicorn
    uvicorn.run("ops_control_plane.main:app", host="0.0.0.0", port=8080, reload=False)

View File

@@ -0,0 +1,192 @@
from __future__ import annotations
from datetime import datetime, timezone
from sqlalchemy import Boolean, DateTime, Float, ForeignKey, Integer, JSON, String, Text
from sqlalchemy.orm import Mapped, mapped_column, relationship
from .database import Base
def utcnow() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime (used as a column default)."""
    return datetime.now(tz=timezone.utc)
class User(Base):
    """Login account stored in the ``users`` table."""
    __tablename__ = "users"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique, indexed login name.
    username: Mapped[str] = mapped_column(String(64), unique=True, index=True)
    # Password hash; the plain-text password is not a column of this table.
    password_hash: Mapped[str] = mapped_column(String(255))
    # Role label; new rows default to "admin".
    role: Mapped[str] = mapped_column(String(32), default="admin")
    # New rows default to active.
    is_active: Mapped[bool] = mapped_column(Boolean, default=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
class MachineProfile(Base):
    """Named instance configuration stored in ``machine_profiles``."""
    __tablename__ = "machine_profiles"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique profile name.
    name: Mapped[str] = mapped_column(String(64), unique=True)
    region: Mapped[str] = mapped_column(String(32))
    instance_type: Mapped[str] = mapped_column(String(32))
    gpu_label: Mapped[str] = mapped_column(String(64))
    vcpu: Mapped[int] = mapped_column(Integer)
    memory_gib: Mapped[float] = mapped_column(Float)
    # Defaults to "spot".
    preferred_lifecycle: Mapped[str] = mapped_column(String(16), default="spot")
    # JSON columns; each new row gets its own empty dict/list by default.
    launch_config: Mapped[dict] = mapped_column(JSON, default=dict)
    intended_workloads: Mapped[list] = mapped_column(JSON, default=list)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
class MarketSnapshot(Base):
    """Point-in-time price/availability observation stored in ``market_snapshots``."""
    __tablename__ = "market_snapshots"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # region, instance_type, lifecycle and observed_at are each indexed individually.
    region: Mapped[str] = mapped_column(String(32), index=True)
    instance_type: Mapped[str] = mapped_column(String(32), index=True)
    lifecycle: Mapped[str] = mapped_column(String(16), index=True)
    # Defaults to not available.
    offering_available: Mapped[bool] = mapped_column(Boolean, default=False)
    # Nullable: a snapshot may carry no price.
    hourly_price_usd: Mapped[float | None] = mapped_column(Float, nullable=True)
    # Defaults to "aws".
    source: Mapped[str] = mapped_column(String(32), default="aws")
    raw_payload: Mapped[dict] = mapped_column(JSON, default=dict)
    observed_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, index=True)
class Machine(Base):
    """Instance record stored in the ``machines`` table."""
    __tablename__ = "machines"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique, indexed AWS instance identifier.
    aws_instance_id: Mapped[str] = mapped_column(String(32), unique=True, index=True)
    name: Mapped[str] = mapped_column(String(128))
    region: Mapped[str] = mapped_column(String(32))
    # Stored as a plain string, not a foreign key to machine_profiles; nullable.
    profile_name: Mapped[str | None] = mapped_column(String(64), nullable=True)
    instance_type: Mapped[str] = mapped_column(String(32))
    lifecycle: Mapped[str] = mapped_column(String(16))
    state: Mapped[str] = mapped_column(String(32))
    # Both addresses are nullable.
    public_ip: Mapped[str | None] = mapped_column(String(64), nullable=True)
    private_ip: Mapped[str | None] = mapped_column(String(64), nullable=True)
    launch_time: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    volume_gb: Mapped[int] = mapped_column(Integer, default=0)
    public_ipv4_attached: Mapped[bool] = mapped_column(Boolean, default=False)
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    # Set on insert and refreshed on every update of the row.
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow)
class WorkloadProfile(Base):
    """Named workload/service definition stored in ``workload_profiles``."""
    __tablename__ = "workload_profiles"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique workload name.
    name: Mapped[str] = mapped_column(String(64), unique=True)
    service_type: Mapped[str] = mapped_column(String(32))
    model_requirements: Mapped[list] = mapped_column(JSON, default=list)
    # The remaining columns are all optional (nullable).
    default_port: Mapped[int | None] = mapped_column(Integer, nullable=True)
    start_command: Mapped[str | None] = mapped_column(Text, nullable=True)
    stop_command: Mapped[str | None] = mapped_column(Text, nullable=True)
    healthcheck_path: Mapped[str | None] = mapped_column(String(255), nullable=True)
    route_hostname: Mapped[str | None] = mapped_column(String(255), nullable=True)
class Job(Base):
    """Record of one control-plane operation, stored in the ``jobs`` table."""
    __tablename__ = "jobs"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    job_type: Mapped[str] = mapped_column(String(32), index=True)
    # Indexed; defaults to "queued" when the creator does not set it.
    status: Mapped[str] = mapped_column(String(32), index=True, default="queued")
    # JSON request parameters and outcome; both default to an empty dict.
    payload: Mapped[dict] = mapped_column(JSON, default=dict)
    result: Mapped[dict] = mapped_column(JSON, default=dict)
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    # Optional links to the machine and session the job concerns.
    machine_id: Mapped[int | None] = mapped_column(ForeignKey("machines.id"), nullable=True)
    session_id: Mapped[int | None] = mapped_column(ForeignKey("sessions.id"), nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
    started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
class Session(Base):
    """Runtime session row stored in the ``sessions`` table.

    This is an ORM model, not a SQLAlchemy database session.
    """
    __tablename__ = "sessions"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Optional link to the machine the session ran on.
    machine_id: Mapped[int | None] = mapped_column(ForeignKey("machines.id"), nullable=True)
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    workload_name: Mapped[str | None] = mapped_column(String(64), nullable=True)
    # Defaults to "active".
    status: Mapped[str] = mapped_column(String(32), default="active")
    started_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
    # Nullable end timestamp.
    ended_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Cost rows for this session; paired with SessionCost.session.
    cost_records: Mapped[list["SessionCost"]] = relationship(back_populates="session")
class SessionCost(Base):
    """Cost calculation for a session, stored in ``session_costs``."""
    __tablename__ = "session_costs"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Required link to the owning session.
    session_id: Mapped[int] = mapped_column(ForeignKey("sessions.id"))
    runtime_hours: Mapped[float] = mapped_column(Float, default=0.0)
    # Cost components in USD; all default to 0.0.
    # NOTE(review): presumably total_cost_usd is the sum of the three components — confirm where rows are written.
    compute_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    storage_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    public_ip_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    total_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    calculated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
    # Owning session; paired with Session.cost_records.
    session: Mapped[Session] = relationship(back_populates="cost_records")
class ModelCatalog(Base):
    """Registered model entry stored in the ``model_catalog`` table."""
    __tablename__ = "model_catalog"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique key identifying the model.
    model_key: Mapped[str] = mapped_column(String(128), unique=True)
    label: Mapped[str] = mapped_column(String(255))
    s3_prefix: Mapped[str] = mapped_column(String(512))
    # JSON columns; each defaults to an empty dict/list.
    expected_manifest: Mapped[dict] = mapped_column(JSON, default=dict)
    checksums: Mapped[dict] = mapped_column(JSON, default=dict)
    compatibility_tags: Mapped[list] = mapped_column(JSON, default=list)
    workload_tags: Mapped[list] = mapped_column(JSON, default=list)
    # Nullable size in GB.
    size_gb: Mapped[float | None] = mapped_column(Float, nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
class MachineModelCache(Base):
    """Per-machine model cache entry stored in ``machine_model_cache``."""
    __tablename__ = "machine_model_cache"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Required link to the machine.
    machine_id: Mapped[int] = mapped_column(ForeignKey("machines.id"))
    # Stored as a plain string, not a foreign key to model_catalog.
    model_key: Mapped[str] = mapped_column(String(128))
    # Defaults to "pending".
    status: Mapped[str] = mapped_column(String(32), default="pending")
    path_on_machine: Mapped[str | None] = mapped_column(String(512), nullable=True)
    hydrated_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    details: Mapped[dict] = mapped_column(JSON, default=dict)
class RouteBinding(Base):
    """Hostname-to-target route record stored in ``route_bindings``."""
    __tablename__ = "route_bindings"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique: at most one binding row per hostname.
    hostname: Mapped[str] = mapped_column(String(255), unique=True)
    target_type: Mapped[str] = mapped_column(String(32))
    target_host: Mapped[str] = mapped_column(String(255))
    target_port: Mapped[int] = mapped_column(Integer)
    # Defaults to "http".
    scheme: Mapped[str] = mapped_column(String(16), default="http")
    # Defaults to "active".
    status: Mapped[str] = mapped_column(String(32), default="active")
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    # Set on insert and refreshed on every update of the row.
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow)
class ServiceState(Base):
    """Status record for a named service, stored in ``service_states``."""
    __tablename__ = "service_states"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Optional link to a machine.
    machine_id: Mapped[int | None] = mapped_column(ForeignKey("machines.id"), nullable=True)
    service_name: Mapped[str] = mapped_column(String(64))
    # No default: the writer must supply a status.
    status: Mapped[str] = mapped_column(String(32))
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    # Set on insert and refreshed on every update of the row.
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow, onupdate=utcnow)
class AuditEvent(Base):
    """ORM row in ``audit_events``: who did what to which entity, plus a JSON payload."""
    __tablename__ = "audit_events"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Nullable: an event may be stored without an actor.
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    action: Mapped[str] = mapped_column(String(64))
    entity_type: Mapped[str] = mapped_column(String(64))
    # Stored as a string (not a foreign key) and nullable.
    entity_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    # Free-form JSON; ``default=dict`` gives each new row its own empty dict.
    payload: Mapped[dict] = mapped_column(JSON, default=dict)
    # Set from ``utcnow`` on insert only (no ``onupdate``).
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)
class CsvExport(Base):
    """ORM row in ``csv_exports``: a record of one generated export file."""
    __tablename__ = "csv_exports"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Nullable: an export may be stored without an actor.
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    export_type: Mapped[str] = mapped_column(String(64))
    # Location of the export (up to 512 chars); presumably a filesystem path — confirm with the writer.
    path: Mapped[str] = mapped_column(String(512))
    # Free-form JSON; ``default=dict`` gives each new row its own empty dict.
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    # Set from ``utcnow`` on insert only (no ``onupdate``).
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=utcnow)

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
import json
import subprocess
from .config import settings
def run_ingress_command(command: str) -> subprocess.CompletedProcess[str]:
    """Run ``command`` on the ingress host over SSH and return the finished process.

    The connection uses ``settings.ssh_key_path``, ``settings.ingress_ssh_port``,
    ``settings.ingress_ssh_user`` and ``settings.ingress_ssh_host``.

    Args:
        command: Shell command line executed by the remote login shell. Callers
            are responsible for quoting any untrusted fragments.

    Returns:
        The ``CompletedProcess`` with captured text ``stdout``/``stderr``. A
        non-zero exit status is NOT raised (``check=False``); inspect
        ``returncode``.
    """
    import os  # local import so this block does not depend on the module header

    ssh_argv = [
        "ssh",
        # NOTE(review): host-key verification is disabled on purpose here; that
        # trades MITM protection for not having to manage known_hosts.
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        # os.devnull is "/dev/null" on POSIX and "nul" on Windows. The previous
        # hard-coded "NUL" made ssh create a real file named NUL on Linux.
        f"UserKnownHostsFile={os.devnull}",
        "-i",
        str(settings.ssh_key_path),
        "-p",
        str(settings.ingress_ssh_port),
        f"{settings.ingress_ssh_user}@{settings.ingress_ssh_host}",
        command,
    ]
    return subprocess.run(ssh_argv, capture_output=True, text=True, check=False)
def apply_route(hostname: str, scheme: str, target_host: str, target_port: int) -> dict:
    """Upsert one route on the ingress host and reload Caddy.

    The route is sent as a JSON argument to ``settings.ingress_route_helper``
    over SSH; Caddy is reloaded only if the helper succeeds (``&&``).

    Args:
        hostname: Public hostname to serve.
        scheme: Upstream scheme, e.g. ``"http"``.
        target_host: Upstream host or IP.
        target_port: Upstream port.

    Returns:
        ``{"stdout", "stderr", "returncode"}`` from the remote command. Failures
        are reported through ``returncode``, not raised.
    """
    import shlex  # local import so this block does not depend on the module header

    payload = json.dumps(
        {"hostname": hostname, "scheme": scheme, "target_host": target_host, "target_port": target_port}
    )
    # The values originate from a web form. shlex.quote stops a single quote in
    # any of them from terminating the argument and injecting remote shell code.
    remote_command = (
        f"sudo {settings.ingress_route_helper} upsert {shlex.quote(payload)}"
        " && sudo systemctl reload caddy"
    )
    result = run_ingress_command(remote_command)
    return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}
def remove_route(hostname: str) -> dict:
    """Delete the route for ``hostname`` on the ingress host and reload Caddy.

    Args:
        hostname: Hostname whose route should be removed.

    Returns:
        ``{"stdout", "stderr", "returncode"}`` from the remote command. Failures
        are reported through ``returncode``, not raised.
    """
    import shlex  # local import so this block does not depend on the module header

    # Quote the hostname: it is user-supplied and was previously pasted into
    # the remote shell command verbatim. Plain hostnames are left unchanged.
    remote_command = (
        f"sudo {settings.ingress_route_helper} delete {shlex.quote(hostname)}"
        " && sudo systemctl reload caddy"
    )
    result = run_ingress_command(remote_command)
    return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from fastapi import Depends, HTTPException, Request, status
from passlib.context import CryptContext
from sqlalchemy import select
from sqlalchemy.orm import Session
from .database import get_db
from .models import User
# Module-wide passlib context used by hash_password/verify_password below.
# Only PBKDF2-SHA256 is configured, so new hashes use that scheme.
# NOTE(review): requirements pin passlib[bcrypt], but bcrypt is not listed in
# ``schemes`` here — confirm whether the extra is still needed.
pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
def hash_password(password: str) -> str:
    """Return the encoded hash of ``password`` produced by the module's passlib context."""
    encoded_hash = pwd_context.hash(password)
    return encoded_hash
def verify_password(password: str, password_hash: str) -> bool:
    """Check ``password`` against ``password_hash`` using the module's passlib context.

    Returns whatever ``pwd_context.verify`` returns; any exception it raises
    propagates to the caller unchanged.
    """
    is_match = pwd_context.verify(password, password_hash)
    return is_match
def get_current_user(request: Request, db: Session = Depends(get_db)) -> User:
    """FastAPI dependency that resolves the logged-in, active user.

    Reads ``username`` from the request session and loads the matching ``User``
    row whose ``is_active`` flag is true.

    Raises:
        HTTPException: 401 when the session has no username, or when no active
            user with that username exists.
    """
    session_username = request.session.get("username")
    if session_username:
        # The database is only queried once the session actually names a user.
        active_user_query = select(User).where(
            User.username == session_username,
            User.is_active.is_(True),
        )
        account = db.scalar(active_user_query)
        if account:
            return account
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)

View File

@@ -0,0 +1,160 @@
from __future__ import annotations
import json
from sqlalchemy import select
from sqlalchemy.orm import Session
from .config import settings
from .models import MachineProfile, ModelCatalog, User, WorkloadProfile
from .security import hash_password
# Built-in machine profiles. seed_defaults() inserts one MachineProfile row per
# entry (looked up by "name") and passes each dict as keyword arguments, so the
# keys here must match MachineProfile's constructor fields.
# All entries target us-east-1; the first is the CPU-only ingress node and the
# rest are g6 GPU instance types that prefer spot capacity.
DEFAULT_MACHINE_PROFILES = [
    {
        "name": "t4g-micro-ingress",
        "region": "us-east-1",
        "instance_type": "t4g.micro",
        "gpu_label": "Ingress CPU",
        "vcpu": 2,
        "memory_gib": 1.0,
        "preferred_lifecycle": "on-demand",
        "intended_workloads": ["ingress"],
    },
    {
        "name": "g6-xlarge",
        "region": "us-east-1",
        "instance_type": "g6.xlarge",
        "gpu_label": "1x NVIDIA L4",
        "vcpu": 4,
        "memory_gib": 16.0,
        "preferred_lifecycle": "spot",
        "intended_workloads": ["light-comfy", "qwen-edit"],
    },
    {
        "name": "g6-2xlarge",
        "region": "us-east-1",
        "instance_type": "g6.2xlarge",
        "gpu_label": "1x NVIDIA L4",
        "vcpu": 8,
        "memory_gib": 32.0,
        "preferred_lifecycle": "spot",
        "intended_workloads": ["comfyui", "qwen-edit"],
    },
    {
        "name": "g6-4xlarge",
        "region": "us-east-1",
        "instance_type": "g6.4xlarge",
        "gpu_label": "1x NVIDIA L4",
        "vcpu": 16,
        "memory_gib": 64.0,
        "preferred_lifecycle": "spot",
        "intended_workloads": ["comfyui", "wan-video", "qwen-edit"],
    },
    {
        "name": "g6-12xlarge",
        "region": "us-east-1",
        "instance_type": "g6.12xlarge",
        "gpu_label": "4x NVIDIA L4",
        "vcpu": 48,
        "memory_gib": 192.0,
        "preferred_lifecycle": "spot",
        "intended_workloads": ["comfyui", "batch-storyboard", "qwen-edit", "multi-gpu"],
    },
]
# Built-in workload definitions. seed_defaults() inserts a WorkloadProfile row
# for each entry whose "name" is not already present, passing the dict as
# keyword arguments; existing rows are left untouched.
DEFAULT_WORKLOADS = [
    {
        "name": "comfyui",
        "service_type": "systemd",
        "model_requirements": [],
        "default_port": 8188,
        # Shell commands for the service; presumably executed on the GPU machine — confirm with the caller.
        "start_command": "sudo systemctl start comfyui",
        "stop_command": "sudo systemctl stop comfyui",
        "healthcheck_path": "/",
        "route_hostname": "comfy.desineuron.in",
    },
]
# Built-in model catalog entries. seed_defaults() inserts a ModelCatalog row for
# each entry whose "model_key" is not already present, passing the dict as
# keyword arguments; existing rows are left untouched.
DEFAULT_MODELS = [
    {
        "model_key": "qwen-image-edit-2511",
        "label": "Qwen Image Edit 2511",
        # Key prefix inside the model bucket (the bucket itself is configured elsewhere).
        "s3_prefix": "models/qwen-image-edit-2511/",
        "compatibility_tags": ["qwen", "image-edit"],
        "workload_tags": ["comfyui", "qwen-edit"],
    },
    {
        "model_key": "qwen-image-2512",
        "label": "Qwen Image 2512",
        "s3_prefix": "models/qwen-image-2512/",
        "compatibility_tags": ["qwen", "image"],
        "workload_tags": ["comfyui", "qwen-image"],
    },
]
def _gpu_launch_config() -> dict:
    """Build a fresh launch_config dict from the GPU-related settings."""
    return {
        "ami_id": settings.gpu_ami_id,
        "subnet_id": settings.gpu_subnet_id,
        "security_group_ids": list(settings.gpu_security_group_ids),
        "key_name": settings.gpu_key_name,
        "instance_profile": settings.gpu_instance_profile,
        "root_volume_gb": settings.gpu_root_volume_gb,
    }


def _load_team_users() -> list[dict]:
    """Parse ``settings.team_users_json`` into a list of dict rows.

    Returns an empty list (and logs a warning) when the value is not valid JSON
    or is not a JSON array; non-object array entries are dropped.
    """
    import logging  # local import so this block does not depend on the module header

    logger = logging.getLogger(__name__)
    try:
        parsed = json.loads(settings.team_users_json)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a setting that is None or otherwise not a string.
        logger.warning("team users setting is not valid JSON; no team users seeded")
        return []
    if not isinstance(parsed, list):
        # A JSON object or scalar used to crash the loop in seed_defaults.
        logger.warning("team users setting must be a JSON array; no team users seeded")
        return []
    rows = [row for row in parsed if isinstance(row, dict)]
    if len(rows) != len(parsed):
        logger.warning("ignored %d team user entries that are not JSON objects", len(parsed) - len(rows))
    return rows


def seed_defaults(db: Session) -> None:
    """Insert or refresh the built-in users, machine profiles, workloads and models.

    The function only adds to / mutates objects in ``db``; committing is left to
    the caller.

    * The admin user from settings is created if missing (never modified).
    * Each team user from ``settings.team_users_json`` is created, or if it
      already exists has its role refreshed and is re-activated; its password
      is only replaced when the row sets a truthy ``reset_password``.
    * Every default machine profile is created, or if it exists has its
      ``launch_config`` overwritten from current settings.
    * Default workloads and models are created only when missing.

    Args:
        db: Open SQLAlchemy session.
    """
    admin_exists = db.scalar(select(User).where(User.username == settings.admin_username))
    if not admin_exists:
        db.add(
            User(
                username=settings.admin_username,
                password_hash=hash_password(settings.admin_password),
                role="admin",
            )
        )

    for row in _load_team_users():
        username = row.get("username")
        password = row.get("password")
        role = row.get("role", "operator")
        if not username or not password:
            # Incomplete entries are skipped rather than seeded half-configured.
            continue
        existing_user = db.scalar(select(User).where(User.username == username))
        if existing_user:
            existing_user.role = role
            existing_user.is_active = True
            if row.get("reset_password"):
                existing_user.password_hash = hash_password(password)
            continue
        db.add(User(username=username, password_hash=hash_password(password), role=role))

    for profile in DEFAULT_MACHINE_PROFILES:
        existing = db.scalar(select(MachineProfile).where(MachineProfile.name == profile["name"]))
        if existing:
            # Keep stored profiles in step with the current environment settings.
            existing.launch_config = _gpu_launch_config()
            continue
        db.add(MachineProfile(**profile, launch_config=_gpu_launch_config()))

    for workload in DEFAULT_WORKLOADS:
        if not db.scalar(select(WorkloadProfile).where(WorkloadProfile.name == workload["name"])):
            db.add(WorkloadProfile(**workload))

    for model in DEFAULT_MODELS:
        if not db.scalar(select(ModelCatalog).where(ModelCatalog.model_key == model["model_key"])):
            db.add(ModelCatalog(**model))

View File

@@ -0,0 +1,209 @@
html{color-scheme:dark}
body{
font-family:Segoe UI,system-ui,sans-serif;
background:
radial-gradient(circle at top right, rgba(220,38,38,.18), transparent 28%),
radial-gradient(circle at left 20%, rgba(239,68,68,.09), transparent 24%),
linear-gradient(180deg, #020202 0%, #070707 100%);
color:#f5f5f5;
margin:0;
min-height:100vh;
}
.hud-grid{
position:fixed;
inset:0;
pointer-events:none;
background-image:
linear-gradient(rgba(255,255,255,.02) 1px, transparent 1px),
linear-gradient(90deg, rgba(255,255,255,.02) 1px, transparent 1px);
background-size:32px 32px;
mask-image:linear-gradient(180deg, rgba(0,0,0,.35), rgba(0,0,0,.85));
}
.topbar{
position:sticky;
top:0;
z-index:10;
display:flex;
justify-content:space-between;
align-items:center;
padding:22px 30px;
background:rgba(10,10,10,.9);
backdrop-filter:blur(18px);
border-bottom:1px solid rgba(255,255,255,.07);
box-shadow:0 10px 40px rgba(0,0,0,.4);
}
.topbar h1{
margin:0;
font-size:24px;
letter-spacing:.04em;
text-transform:uppercase;
}
.topbar p{
margin:5px 0 0;
color:#b8b8b8;
max-width:760px;
}
.topbar-actions{
display:flex;
gap:12px;
align-items:center;
}
.user-chip{
display:inline-flex;
align-items:center;
padding:8px 12px;
border:1px solid rgba(248,113,113,.45);
border-radius:999px;
color:#fca5a5;
background:rgba(127,29,29,.22);
box-shadow:0 0 24px rgba(220,38,38,.15) inset;
}
.topbar-actions a,.button,button{
display:inline-flex;
align-items:center;
justify-content:center;
gap:8px;
background:linear-gradient(180deg, #ef4444 0%, #991b1b 100%);
color:#fff;
border:1px solid rgba(248,113,113,.5);
border-radius:12px;
padding:10px 14px;
text-decoration:none;
cursor:pointer;
box-shadow:0 0 24px rgba(220,38,38,.18);
}
.button.secondary,button.secondary{
background:rgba(255,255,255,.04);
border-color:rgba(255,255,255,.14);
color:#fff;
box-shadow:none;
}
.button.danger,button.danger{
background:linear-gradient(180deg, #dc2626 0%, #7f1d1d 100%);
}
.page{
position:relative;
padding:26px;
}
.grid{display:grid;gap:20px}
.grid.two{grid-template-columns:repeat(2,minmax(0,1fr))}
.grid.three{grid-template-columns:repeat(3,minmax(0,1fr))}
.summary-grid{display:grid;grid-template-columns:repeat(4,minmax(0,1fr));gap:20px;margin-bottom:20px}
.card{
position:relative;
overflow:hidden;
background:linear-gradient(180deg, rgba(16,16,16,.88) 0%, rgba(8,8,8,.92) 100%);
border:1px solid rgba(255,255,255,.08);
border-radius:20px;
padding:22px;
margin-bottom:20px;
box-shadow:
0 16px 40px rgba(0,0,0,.45),
0 0 0 1px rgba(255,255,255,.02) inset;
}
.card::after{
content:"";
position:absolute;
inset:auto -20% -60% auto;
width:180px;
height:180px;
background:radial-gradient(circle, rgba(220,38,38,.16), transparent 65%);
pointer-events:none;
}
.card h2{
margin:0 0 16px;
font-size:18px;
letter-spacing:.04em;
text-transform:uppercase;
}
.card.narrow{max-width:460px;margin:90px auto}
.card.stat strong{
display:block;
font-size:30px;
margin:8px 0;
color:#fff;
}
.eyebrow{
color:#f87171;
font-size:11px;
letter-spacing:.18em;
text-transform:uppercase;
}
.flash{
display:flex;
gap:12px;
align-items:center;
}
.flash.success{
border-color:rgba(248,113,113,.35);
background:linear-gradient(180deg, rgba(127,29,29,.25) 0%, rgba(18,18,18,.95) 100%);
}
.flash.error{
border-color:rgba(248,113,113,.6);
background:linear-gradient(180deg, rgba(69,10,10,.55) 0%, rgba(18,18,18,.95) 100%);
}
.stack{display:grid;gap:12px}
.action-stack{display:grid;gap:8px}
.plain-list{padding-left:18px;margin:0;display:grid;gap:8px;color:#d6d6d6}
.kv-list{display:grid;gap:10px}
.kv-list div{display:flex;justify-content:space-between;gap:12px}
.checkbox-row{
display:flex;
align-items:center;
gap:10px;
color:#f5f5f5;
}
label{display:grid;gap:6px;color:#d0d0d0}
input,select{
padding:11px 12px;
border-radius:12px;
border:1px solid rgba(255,255,255,.12);
background:rgba(255,255,255,.03);
color:#fff;
outline:none;
}
input:focus,select:focus{
border-color:rgba(248,113,113,.75);
box-shadow:0 0 0 3px rgba(220,38,38,.16);
}
table{width:100%;border-collapse:collapse}
th,td{
padding:12px 10px;
border-bottom:1px solid rgba(255,255,255,.08);
text-align:left;
vertical-align:top;
}
th{
color:#fca5a5;
font-weight:600;
font-size:12px;
letter-spacing:.08em;
text-transform:uppercase;
}
.pill{
display:inline-block;
padding:4px 10px;
border-radius:999px;
font-size:12px;
background:rgba(255,255,255,.06);
color:#f3f3f3;
}
.pill.available{
background:rgba(127,29,29,.45);
color:#fecaca;
border:1px solid rgba(248,113,113,.3);
}
.pill.unavailable{
background:rgba(31,31,31,.9);
color:#d4d4d4;
}
.pill.unknown{
background:rgba(55,65,81,.5);
color:#e5e7eb;
}
.muted{color:#a3a3a3;font-size:12px}
.error{color:#fca5a5}
@media (max-width: 1100px){
.grid.two,.grid.three,.summary-grid{grid-template-columns:1fr}
}

View File

@@ -0,0 +1,27 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{{ title or "Desineuron Ops" }}</title>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<div class="hud-grid" aria-hidden="true"></div>
<header class="topbar">
<div>
<h1>Desineuron Ops Control Plane</h1>
<p>Linux-hosted AWS control surface for machines, models, routes, and cost</p>
</div>
{% if user %}
<div class="topbar-actions">
<span class="user-chip">{{ user.username }}</span>
<a href="/logout">Logout</a>
</div>
{% endif %}
</header>
<main class="page">
{% block content %}{% endblock %}
</main>
</body>
</html>

View File

@@ -0,0 +1,355 @@
{% extends "base.html" %}
{% block content %}
{% if flash %}
<section class="card flash {{ flash.level }}">
<strong>{{ flash.level|capitalize }}</strong>
<span>{{ flash.message }}</span>
</section>
{% endif %}
<section class="summary-grid">
<article class="card stat">
<span class="eyebrow">Machines</span>
<strong>{{ summary.machine_count }}</strong>
<span class="muted">Known AWS nodes</span>
</article>
<article class="card stat">
<span class="eyebrow">Hourly Burn</span>
<strong>${{ summary.hourly_burn_usd }}</strong>
<span class="muted">Estimated live blended hourly cost</span>
</article>
<article class="card stat">
<span class="eyebrow">24h Cost</span>
<strong>${{ summary.last_24h_usd }}</strong>
<span class="muted">Rolling 24 hour estimate</span>
</article>
<article class="card stat">
<span class="eyebrow">30d Cost</span>
<strong>${{ summary.last_30d_usd }}</strong>
<span class="muted">Rolling 30 day estimate</span>
</article>
</section>
<div class="grid three">
<section class="card">
<h2>Control Surface</h2>
<div class="kv-list">
<div><span>Bucket</span><strong>{{ bucket_name or "not configured" }}</strong></div>
<div><span>Visible regions</span><strong>{{ regions|join(", ") }}</strong></div>
<div><span>Active sessions</span><strong>{{ summary.active_sessions }}</strong></div>
<div><span>Active jobs</span><strong>{{ summary.active_jobs }}</strong></div>
<div><span>Active routes</span><strong>{{ summary.routes_active }}</strong></div>
<div><span>Fleet est. cost</span><strong>${{ summary.fleet_estimated_cost_usd }}</strong></div>
</div>
</section>
<section class="card">
<h2>Launch Machine</h2>
<form method="post" action="/api/machines/launch" class="stack">
<label>Profile
<select name="profile_name">
{% for profile in profiles %}
<option value="{{ profile.name }}">{{ profile.name }} | {{ profile.instance_type }} | {{ profile.gpu_label }}</option>
{% endfor %}
</select>
</label>
<label>Lifecycle
<select name="lifecycle">
<option value="spot">spot</option>
<option value="on-demand">on-demand</option>
</select>
</label>
<button type="submit">Launch Selected Machine</button>
</form>
</section>
<section class="card">
<h2>Runbooks</h2>
<ul class="plain-list">
<li>1. Launch preferred GPU profile.</li>
<li>2. Hydrate required model from S3.</li>
<li>3. Start workload and optionally map route.</li>
<li>4. Monitor runtime and estimated cost.</li>
<li>5. Stop or terminate the node when done.</li>
</ul>
<a class="button secondary" href="/api/exports/csv">Export Sessions CSV</a>
</section>
</div>
<section class="card">
<h2>Markets</h2>
<table>
<thead>
<tr>
<th>Profile</th>
<th>Instance</th>
<th>GPU</th>
<th>vCPU / RAM</th>
<th>Region</th>
<th>On-Demand</th>
<th>Spot</th>
<th>Preferred Use</th>
</tr>
</thead>
<tbody>
{% for profile in profiles %}
{% for region in regions %}
{% set ns = namespace(on_demand='-', on_demand_status='unknown', spot='-', spot_status='unknown') %}
{% for market in market_rows %}
{% if market.region == region and market.instance_type == profile.instance_type and market.lifecycle == 'on-demand' %}
{% set ns.on_demand = '$' ~ market.hourly_price_usd if market.hourly_price_usd is not none else '-' %}
{% set ns.on_demand_status = 'available' if market.offering_available else 'unavailable' %}
{% endif %}
{% if market.region == region and market.instance_type == profile.instance_type and market.lifecycle == 'spot' %}
{% set ns.spot = '$' ~ market.hourly_price_usd if market.hourly_price_usd is not none else '-' %}
{% set ns.spot_status = 'available' if market.offering_available else 'unavailable' %}
{% endif %}
{% endfor %}
<tr>
<td>{{ profile.name }}</td>
<td>{{ profile.instance_type }}</td>
<td>{{ profile.gpu_label }}</td>
<td>{{ profile.vcpu }} / {{ profile.memory_gib }} GiB</td>
<td>{{ region }}</td>
<td><span class="pill {{ ns.on_demand_status }}">{{ ns.on_demand }}</span></td>
<td><span class="pill {{ ns.spot_status }}">{{ ns.spot }}</span></td>
<td>{{ profile.intended_workloads|join(", ") }}</td>
</tr>
{% endfor %}
{% endfor %}
</tbody>
</table>
</section>
<section class="card">
<h2>Machines</h2>
<table>
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th>State</th>
<th>IPs</th>
<th>Runtime</th>
<th>Cost</th>
<th>Actions</th>
</tr>
</thead>
<tbody>
{# One row per known machine. Cost cells show a bare "-" when no cost entry exists, instead of "$-" / "- h". #}
{% for machine in machines %}
{% set machine_cost_known = machine.aws_instance_id in costs %}
<tr>
  <td>
    <strong>{{ machine.name }}</strong>
    <div class="muted">{{ machine.aws_instance_id }}</div>
  </td>
  <td>
    <div>{{ machine.instance_type }}</div>
    <div class="muted">{{ machine.lifecycle }} / {{ machine.region }}</div>
  </td>
  <td>{{ machine.state }}</td>
  <td>
    <div>{{ machine.public_ip or "-" }}</div>
    <div class="muted">{{ machine.private_ip or "-" }}</div>
  </td>
  <td>{% if machine_cost_known %}{{ costs[machine.aws_instance_id].runtime_hours }} h{% else %}-{% endif %}</td>
  <td>
    {% if machine_cost_known %}
    <div>${{ costs[machine.aws_instance_id].total_cost_usd }}</div>
    <div class="muted">${{ costs[machine.aws_instance_id].hourly_price_usd }}/hr</div>
    {% else %}
    <div>-</div>
    {% endif %}
  </td>
  <td>
    <div class="action-stack">
      <form method="post" action="/api/machines/{{ machine.id }}/stop">
        <button type="submit" class="button secondary">Stop</button>
      </form>
      <form method="post" action="/api/machines/{{ machine.id }}/terminate">
        <button type="submit" class="button danger">Terminate</button>
      </form>
    </div>
  </td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
<div class="grid two">
<section class="card">
<h2>Model Library Ingest</h2>
<form method="post" action="/api/models/register" class="stack">
<label>Model Key <input type="text" name="model_key" placeholder="qwen-image-edit-2511" required></label>
<label>Label <input type="text" name="label" placeholder="Qwen Image Edit 2511" required></label>
<label>Source Path Under Linux Model Library <input type="text" name="source_relative_path" placeholder="Qwen-Image-Edit-2511" required></label>
<label>Workload Tags <input type="text" name="workload_tags" placeholder="comfyui, qwen-edit"></label>
<label>Compatibility Tags <input type="text" name="compatibility_tags" placeholder="qwen, image-edit"></label>
<button type="submit">Upload to S3 + Generate Manifest</button>
</form>
</section>
<section class="card">
<h2>Hydrate Model</h2>
<form method="post" action="/api/models/hydrate" class="stack">
<label>Machine
<select name="machine_id">
{% for machine in machines %}
<option value="{{ machine.id }}">{{ machine.name }} ({{ machine.aws_instance_id }})</option>
{% endfor %}
</select>
</label>
<label>Model
<select name="model_key">
{% for model in models %}
<option value="{{ model.model_key }}">{{ model.label }}</option>
{% endfor %}
</select>
</label>
<button type="submit">Hydrate from S3</button>
</form>
</section>
<section class="card">
<h2>Start Workload</h2>
<form method="post" action="/api/workloads/start" class="stack">
<label>Machine
<select name="machine_id">
{% for machine in machines %}
<option value="{{ machine.id }}">{{ machine.name }}</option>
{% endfor %}
</select>
</label>
<label>Workload
<select name="workload_name">
{% for workload in workloads %}
<option value="{{ workload.name }}">{{ workload.name }}</option>
{% endfor %}
</select>
</label>
<label class="checkbox-row"><input type="checkbox" name="auto_route" value="true"> Auto-map workload hostname via ingress</label>
<button type="submit">Start Workload</button>
</form>
</section>
</div>
<section class="card">
<h2>Registered Models</h2>
<table>
<thead>
<tr><th>Model</th><th>S3 Prefix</th><th>Size</th><th>Files</th><th>Tags</th></tr>
</thead>
<tbody>
{% for model in models %}
<tr>
<td>
<strong>{{ model.label }}</strong>
<div class="muted">{{ model.model_key }}</div>
</td>
<td>{{ model.s3_prefix }}</td>
<td>{{ model.size_gb or "-" }} GiB</td>
<td>{{ model.expected_manifest.file_count if model.expected_manifest else "-" }}</td>
<td>
<div>{{ model.workload_tags|join(", ") }}</div>
<div class="muted">{{ model.compatibility_tags|join(", ") }}</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
<div class="grid two">
<section class="card">
<h2>Route Management</h2>
<form method="post" action="/api/routes/map" class="stack">
<label>Hostname <input type="text" name="hostname" placeholder="gpu-ui.desineuron.in" required></label>
<label>Scheme
<select name="scheme">
<option value="http">http</option>
<option value="https">https</option>
</select>
</label>
<label>Target Host <input type="text" name="target_host" placeholder="172.31.x.x" required></label>
<label>Target Port <input type="number" name="target_port" value="8188" required></label>
<button type="submit">Map Route</button>
</form>
<table>
<thead>
<tr><th>Hostname</th><th>Target</th><th>Status</th><th>Action</th></tr>
</thead>
<tbody>
{% for route in routes %}
<tr>
<td>{{ route.hostname }}</td>
<td>{{ route.scheme }}://{{ route.target_host }}:{{ route.target_port }}</td>
<td>{{ route.status }}</td>
<td>
<form method="post" action="/api/routes/unmap">
<input type="hidden" name="hostname" value="{{ route.hostname }}">
<button type="submit" class="button secondary">Unmap</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
<section class="card">
<h2>Recent Sessions</h2>
<table>
<thead>
<tr><th>Actor</th><th>Workload</th><th>Status</th><th>Started</th></tr>
</thead>
<tbody>
{% for session in sessions %}
<tr>
<td>{{ session.actor }}</td>
<td>{{ session.workload_name }}</td>
<td>{{ session.status }}</td>
<td>{{ session.started_at }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
</div>
<div class="grid two">
<section class="card">
<h2>Recent Jobs</h2>
<table>
<thead>
<tr><th>ID</th><th>Type</th><th>Status</th><th>Actor</th><th>Created</th></tr>
</thead>
<tbody>
{% for job in jobs %}
<tr>
<td>{{ job.id }}</td>
<td>{{ job.job_type }}</td>
<td>{{ job.status }}</td>
<td>{{ job.actor or "-" }}</td>
<td>{{ job.created_at }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
<section class="card">
<h2>Audit</h2>
<table>
<thead>
<tr><th>Actor</th><th>Action</th><th>Entity</th><th>Time</th></tr>
</thead>
<tbody>
{% for event in audits %}
<tr>
<td>{{ event.actor or "-" }}</td>
<td>{{ event.action }}</td>
<td>{{ event.entity_type }} / {{ event.entity_id }}</td>
<td>{{ event.created_at }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</section>
</div>
{% endblock %}

View File

@@ -0,0 +1,14 @@
{% extends "base.html" %}
{% block content %}
<section class="card narrow">
<p class="eyebrow">Private Surface</p>
<h2>Login</h2>
<p class="muted">Use your Desineuron operator account.</p>
{% if error %}<p class="error">{{ error }}</p>{% endif %}
<form method="post" action="/login" class="stack">
<label>Email or username <input type="text" name="username" required></label>
<label>Password <input type="password" name="password" required></label>
<button type="submit">Enter Ops Console</button>
</form>
</section>
{% endblock %}

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
import time
from datetime import datetime, timedelta, timezone
from sqlalchemy import select
from .aws_control import latest_market_price, refresh_market_snapshots, sync_instances, upsert_session_cost
from .database import Base, engine, session_scope
from .models import Machine, MachineProfile, Session as RuntimeSession
from .seed import seed_defaults
def _run_worker_cycle(last_market_refresh: datetime | None) -> datetime | None:
    """Run one poll cycle in its own session and return the market-refresh timestamp.

    Returns ``last_market_refresh`` unchanged unless market snapshots were
    refreshed during this cycle, in which case the new timestamp is returned.
    """
    with session_scope() as db:
        seed_defaults(db)
        profiles = db.scalars(select(MachineProfile)).all()
        sync_instances(db, {profile.region for profile in profiles})

        # Make sure every running machine has an active session so cost is tracked.
        running_machines = db.scalars(select(Machine).where(Machine.state == "running")).all()
        for machine in running_machines:
            active_session = db.scalar(
                select(RuntimeSession).where(
                    RuntimeSession.machine_id == machine.id,
                    RuntimeSession.status == "active",
                )
            )
            if not active_session:
                db.add(
                    RuntimeSession(
                        machine_id=machine.id,
                        actor="system-import",
                        workload_name=machine.profile_name or machine.instance_type,
                        status="active",
                        notes="Imported from existing running machine state",
                    )
                )

        # Market prices are refreshed at most every 15 minutes.
        now = datetime.now(timezone.utc)
        if last_market_refresh is None or now - last_market_refresh > timedelta(minutes=15):
            refresh_market_snapshots(db, {profile.region for profile in profiles}, profiles)
            last_market_refresh = datetime.now(timezone.utc)

        sessions = db.scalars(select(RuntimeSession).where(RuntimeSession.status == "active")).all()
        for session_row in sessions:
            if not session_row.machine_id:
                continue
            machine = db.get(Machine, session_row.machine_id)
            if machine:
                upsert_session_cost(db, session_row, machine)
    return last_market_refresh


def run_worker() -> None:
    """Create tables, then poll forever: seed, sync instances, track sessions and cost.

    Each cycle runs once a minute. A failure inside one cycle (for example a
    transient AWS or database error) is logged and the loop continues, instead
    of terminating the worker and losing the market-refresh timer.
    ``KeyboardInterrupt``/``SystemExit`` still stop the loop.
    """
    import logging  # local import so this block does not depend on the module header

    logger = logging.getLogger(__name__)
    Base.metadata.create_all(bind=engine)
    last_market_refresh: datetime | None = None
    while True:
        try:
            last_market_refresh = _run_worker_cycle(last_market_refresh)
        except Exception:
            # Top-level loop boundary: record the traceback and retry next cycle.
            logger.exception("ops worker cycle failed; retrying after the poll interval")
        time.sleep(60)
if __name__ == "__main__":
run_worker()

View File

@@ -0,0 +1,13 @@
fastapi==0.116.1
uvicorn[standard]==0.35.0
sqlalchemy==2.0.43
psycopg[binary]==3.2.10
jinja2==3.1.6
python-multipart==0.0.20
itsdangerous==2.2.0
passlib[bcrypt]==1.7.4
boto3==1.40.35
httpx==0.28.1
typer==0.16.1
python-dateutil==2.9.0.post0

View File

@@ -0,0 +1,58 @@
# Compose stack for the ops control plane: Postgres plus an API and a worker
# container built from the same ./app image.
services:
  # Postgres 16 (Alpine image). Credentials come from the OPS_DB_* variables.
  ops-db:
    image: postgres:16-alpine
    container_name: desineuron-ops-db
    environment:
      POSTGRES_DB: ${OPS_DB_NAME}
      POSTGRES_USER: ${OPS_DB_USER}
      POSTGRES_PASSWORD: ${OPS_DB_PASSWORD}
    # NOTE(review): no bind address is given, so host port 5435 is published on
    # every host interface — confirm that is intended, or bind to 127.0.0.1.
    ports:
      - "5435:5432"
    volumes:
      - ./data/postgres:/var/lib/postgresql/data
    # The API and worker wait for this check via depends_on/service_healthy.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${OPS_DB_USER} -d ${OPS_DB_NAME}"]
      interval: 10s
      timeout: 5s
      retries: 10
    restart: unless-stopped
  # Web/API process (python -m ops_control_plane.main), listening on 8080 in the container.
  ops-api:
    build:
      context: ./app
    container_name: desineuron-ops-api
    command: ["python", "-m", "ops_control_plane.main"]
    env_file:
      - .env
    environment:
      OPS_ROLE: api
    # Host port 18765 is the upstream the nginx site config proxies to (127.0.0.1:18765).
    # NOTE(review): also published on every host interface — confirm.
    ports:
      - "18765:8080"
    depends_on:
      ops-db:
        condition: service_healthy
    volumes:
      - ./exports:/app/exports
      - ./logs:/app/logs
      - ./state:/app/state
      # Model library is mounted read-only; the host path falls back to the default after ":-".
      - ${OPS_MODEL_LIBRARY_HOST_PATH:-/mnt/ServerStorage/ai-models/models}:/model-library:ro
    restart: unless-stopped
  # Background poller (python -m ops_control_plane.worker); same image and mounts, no published port.
  ops-worker:
    build:
      context: ./app
    container_name: desineuron-ops-worker
    command: ["python", "-m", "ops_control_plane.worker"]
    env_file:
      - .env
    environment:
      OPS_ROLE: worker
    depends_on:
      ops-db:
        condition: service_healthy
    volumes:
      - ./exports:/app/exports
      - ./logs:/app/logs
      - ./state:/app/state
      - ${OPS_MODEL_LIBRARY_HOST_PATH:-/mnt/ServerStorage/ai-models/models}:/model-library:ro
    restart: unless-stopped

View File

@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Install the route helper and Caddyfile staged in /tmp onto the ingress host,
# validate the result, and reload Caddy.
set -euo pipefail

STAGED_CADDYFILE=/tmp/desineuron_ingress_Caddyfile
LIVE_CADDYFILE=/etc/caddy/Caddyfile
BACKUP_CADDYFILE=""

sudo mkdir -p /etc/caddy/managed
sudo install -m 0755 /tmp/manage_desineuron_routes.py /usr/local/bin/manage_desineuron_routes.py
# Smoke-test the helper before touching the Caddy configuration.
sudo python3 /usr/local/bin/manage_desineuron_routes.py list >/dev/null

# Keep a copy of the working config so a bad upload cannot stay installed.
if sudo test -f "$LIVE_CADDYFILE"; then
  BACKUP_CADDYFILE="$(sudo mktemp /etc/caddy/Caddyfile.bak.XXXXXX)"
  sudo cp -p "$LIVE_CADDYFILE" "$BACKUP_CADDYFILE"
fi

sudo install -m 0644 "$STAGED_CADDYFILE" "$LIVE_CADDYFILE"

if ! sudo caddy validate --config "$LIVE_CADDYFILE"; then
  echo "New Caddyfile failed validation; restoring the previous configuration" >&2
  if [[ -n "$BACKUP_CADDYFILE" ]]; then
    sudo install -m 0644 "$BACKUP_CADDYFILE" "$LIVE_CADDYFILE"
    sudo rm -f "$BACKUP_CADDYFILE"
  fi
  exit 1
fi

if [[ -n "$BACKUP_CADDYFILE" ]]; then
  sudo rm -f "$BACKUP_CADDYFILE"
fi
sudo systemctl reload caddy

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Deploy the ops control plane from /tmp/desineuron_ops_control_plane into
# /opt, then install and start a systemd unit that runs `docker compose up`.
set -euo pipefail

TARGET_ROOT=/opt/desineuron-ops-control-plane
SERVICE_FILE=/etc/systemd/system/desineuron-ops-control-plane.service
# The stack uses postgres:16-alpine, whose server runs as uid/gid 70
# (999 is the Debian-based image's id).
POSTGRES_OWNER=70:70
# `id` works even when $USER is unset, and the primary group need not be
# named after the user.
DEPLOY_OWNER="$(id -un):$(id -gn)"

sudo mkdir -p "$TARGET_ROOT/data/postgres" "$TARGET_ROOT/exports" "$TARGET_ROOT/logs" "$TARGET_ROOT/state"

# Runtime state (.env, database, exports, logs, keys) is never overwritten by a deploy.
sudo rsync -a \
  --exclude '.env' \
  --exclude 'data/' \
  --exclude 'exports/' \
  --exclude 'logs/' \
  --exclude 'state/' \
  /tmp/desineuron_ops_control_plane/ "$TARGET_ROOT/"

# Hand the tree to the deploying user but skip the Postgres data directory:
# re-owning live database files on a re-deploy would lock the running server
# out of its own data.
sudo find "$TARGET_ROOT" -path "$TARGET_ROOT/data/postgres" -prune -o -exec chown "$DEPLOY_OWNER" {} +

if [[ ! -f "$TARGET_ROOT/.env" ]]; then
  cp "$TARGET_ROOT/.env.example" "$TARGET_ROOT/.env"
  echo "WARNING: created $TARGET_ROOT/.env from .env.example; replace every 'change-me' value before exposing the service" >&2
fi
chmod 600 "$TARGET_ROOT/.env"

if [[ ! -f "$TARGET_ROOT/state/desineuron-l4-node.pem" ]]; then
  echo "Missing $TARGET_ROOT/state/desineuron-l4-node.pem" >&2
  exit 1
fi
chmod 600 "$TARGET_ROOT/state/desineuron-l4-node.pem"

# Best effort, as before: ownership is also corrected by the image entrypoint.
sudo chown -R "$POSTGRES_OWNER" "$TARGET_ROOT/data/postgres" || true

# Unquoted heredoc on purpose: $TARGET_ROOT is expanded into the unit file.
sudo tee "$SERVICE_FILE" >/dev/null <<EOF
[Unit]
Description=Desineuron Ops Control Plane
After=docker.service network-online.target
Requires=docker.service
[Service]
Type=oneshot
RemainAfterExit=yes
WorkingDirectory=$TARGET_ROOT
ExecStart=/usr/bin/docker compose up -d --build
ExecStop=/usr/bin/docker compose down
TimeoutStartSec=0
[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now desineuron-ops-control-plane.service
sudo systemctl --no-pager --full status desineuron-ops-control-plane.service

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Write the nginx site for ops.desineuron.in, enable it, test the config and reload nginx.
set -euo pipefail
# Site definition and the sites-enabled symlink that activates it.
TARGET=/etc/nginx/sites-available/desineuron-ops-control-plane.conf
LINK=/etc/nginx/sites-enabled/desineuron-ops-control-plane.conf
# Quoted heredoc ('EOF'): nginx variables such as $host are written literally,
# not expanded by the shell.
# The site terminates TLS on 443 and proxies everything to 127.0.0.1:18765.
# NOTE(review): only a 443 server is defined here; there is no port-80 block in this file.
# NOTE(review): newer nginx releases deprecate the `http2` listen parameter in
# favour of an `http2 on;` directive — confirm against the installed version.
sudo tee "$TARGET" >/dev/null <<'EOF'
server {
listen 443 ssl http2;
listen [::]:443 ssl http2;
server_name ops.desineuron.in;
ssl_certificate /etc/letsencrypt/live/desineuron-infra/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/desineuron-infra/privkey.pem;
ssl_protocols TLSv1.2 TLSv1.3;
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
client_max_body_size 128m;
location / {
proxy_pass http://127.0.0.1:18765;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_read_timeout 3600;
proxy_send_timeout 3600;
}
}
EOF
# -f replaces an existing link so the script can be re-run.
sudo ln -sf "$TARGET" "$LINK"
# With `set -e`, a failed config test aborts before the reload.
sudo nginx -t
sudo systemctl reload nginx

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import sys
from pathlib import Path
STATE_FILE = Path("/etc/caddy/managed/desineuron-routes.json")
SNIPPET_FILE = Path("/etc/caddy/managed/desineuron-routes.caddy")
def load_routes() -> dict[str, dict]:
    """Return the routes stored in ``STATE_FILE``, or an empty dict if it does not exist.

    The file content is parsed as JSON; a malformed file raises from ``json.loads``.
    """
    if not STATE_FILE.exists():
        return {}
    raw_state = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw_state)
def save_routes(routes: dict[str, dict]) -> None:
    """Write ``routes`` to ``STATE_FILE`` as indented JSON, creating its directory if needed."""
    state_dir = STATE_FILE.parent
    state_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(routes, indent=2)
    STATE_FILE.write_text(serialized, encoding="utf-8")
def render_routes(routes: dict[str, dict]) -> None:
    """Regenerate ``SNIPPET_FILE`` with one Caddy site block per route.

    Sites are emitted in hostname order. Each block logs JSON access records to
    /var/log/caddy/access.log and reverse-proxies to the route's
    scheme/target_host/target_port. The file always ends with exactly one
    newline, including when ``routes`` is empty.
    """
    rendered: list[str] = []
    for hostname in sorted(routes):
        route = routes[hostname]
        upstream = f"{route['scheme']}://{route['target_host']}:{route['target_port']}"
        rendered += [
            f"{hostname} {{",
            "\tlog {",
            "\t\toutput file /var/log/caddy/access.log",
            "\t\tformat json",
            "\t}",
            f"\treverse_proxy {upstream} {{",
            "\t\theader_up Host {host}",
            "\t\theader_up X-Forwarded-Host {host}",
            "\t\theader_up X-Forwarded-Proto {scheme}",
            "\t\theader_up X-Forwarded-For {remote_host}",
            "\t}",
            "}",
            "",  # blank separator line between site blocks
        ]
    snippet_body = "\n".join(rendered).rstrip()
    SNIPPET_FILE.write_text(snippet_body + "\n", encoding="utf-8")
_USAGE = "usage: manage_desineuron_routes.py <upsert|delete|list> [payload|hostname]"
_REQUIRED_ROUTE_FIELDS = ("hostname", "scheme", "target_host", "target_port")


def _route_payload_error(payload: object) -> str | None:
    """Return why ``payload`` cannot be stored as a route, or None if it is usable."""
    if not isinstance(payload, dict):
        return "payload must be a JSON object"
    missing = [field for field in _REQUIRED_ROUTE_FIELDS if field not in payload]
    if missing:
        return f"payload is missing required field(s): {', '.join(missing)}"
    for field in ("hostname", "scheme", "target_host"):
        value = payload[field]
        # These values are written verbatim into the Caddy snippet; whitespace
        # or braces would let one value alter the surrounding configuration.
        if not isinstance(value, str) or not value or any(ch.isspace() or ch in "{}" for ch in value):
            return f"{field} must be a non-empty string without whitespace or braces"
    port = payload["target_port"]
    if isinstance(port, bool) or not str(port).isdigit():
        return "target_port must be a non-negative integer"
    return None


def main() -> int:
    """Command-line entry point: ``upsert <json>``, ``delete <hostname>`` or ``list``.

    ``upsert`` and ``delete`` update the stored routes and regenerate the Caddy
    snippet; ``list`` prints the stored routes as JSON.

    Returns:
        0 on success. 1 for a missing or unknown command, a missing argument,
        or an invalid upsert payload — in all of those cases nothing is
        written, so a bad request can no longer leave state that breaks every
        later run.
    """
    args = sys.argv[1:]
    if not args:
        print(_USAGE)
        return 1
    command = args[0]
    if command not in ("upsert", "delete", "list"):
        print(f"unknown command: {command}")
        return 1
    if command == "list":
        print(json.dumps(load_routes(), indent=2))
        return 0

    # upsert and delete both need a second argument.
    if len(args) < 2:
        print(_USAGE)
        return 1
    argument = args[1]

    if command == "upsert":
        try:
            payload = json.loads(argument)
        except json.JSONDecodeError as exc:
            print(f"invalid upsert payload: {exc}")
            return 1
        problem = _route_payload_error(payload)
        if problem is not None:
            print(f"invalid upsert payload: {problem}")
            return 1
        routes = load_routes()
        routes[payload["hostname"]] = payload
        save_routes(routes)
        render_routes(routes)
        print(json.dumps({"status": "ok", "action": "upsert", "hostname": payload["hostname"]}))
        return 0

    # command == "delete"; removing an unknown hostname is not an error.
    routes = load_routes()
    routes.pop(argument, None)
    save_routes(routes)
    render_routes(routes)
    print(json.dumps({"status": "ok", "action": "delete", "hostname": argument}))
    return 0
if __name__ == "__main__":
raise SystemExit(main())