Built the Sentinel Tab
This commit is contained in:
16
infrastructure/ops_control_plane/app/Dockerfile
Normal file
16
infrastructure/ops_control_plane/app/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
# Runtime image for the ops control plane service.
FROM python:3.12-slim

# Do not write .pyc files and do not buffer stdout/stderr.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# OS packages are installed before requirements.txt is copied so that a
# Python dependency change does not invalidate the apt layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends openssh-client curl ca-certificates \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

# Application code is copied last so code-only changes reuse every layer above.
COPY ops_control_plane /app/ops_control_plane

CMD ["python", "-m", "ops_control_plane.main"]
|
||||
@@ -0,0 +1 @@
|
||||
# Names exported by a star-import of this module (presumably the package
# __init__ — the file name is not visible in this diff hunk).
__all__ = ["main"]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,549 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import hashlib
|
||||
import io
|
||||
import json
|
||||
import shlex
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from collections.abc import Iterable
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .config import settings
|
||||
from .models import AuditEvent, Machine, MachineModelCache, MachineProfile, MarketSnapshot, ModelCatalog, RouteBinding, Session as RuntimeSession, SessionCost
|
||||
|
||||
|
||||
# AWS region code -> location name used as the "location" filter value in the
# Pricing API query issued by fetch_on_demand_price.
REGION_LOCATION_MAP = {
    "us-east-1": "US East (N. Virginia)",
    "ap-south-1": "Asia Pacific (Mumbai)",
    "eu-west-1": "EU (Ireland)",
}

# Hourly USD prices keyed by (region, instance_type); fetch_on_demand_price
# returns these when the Pricing API response yields no usable price.
ON_DEMAND_PRICE_FALLBACKS = {
    ("us-east-1", "t4g.micro"): 0.0084,
}
|
||||
|
||||
|
||||
def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
def ec2_client(region: str):
    """Return a boto3 EC2 client bound to *region*."""
    return boto3.client(service_name="ec2", region_name=region)
|
||||
|
||||
|
||||
def pricing_client():
    """Return a boto3 Pricing client; the client region is fixed to us-east-1."""
    pricing_region = "us-east-1"
    return boto3.client(service_name="pricing", region_name=pricing_region)
|
||||
|
||||
|
||||
def s3_client(region: str | None = None):
    """Return a boto3 S3 client.

    Args:
        region: Region for the client; when empty or ``None`` the configured
            ``settings.bucket_region`` is used.
    """
    target_region = region if region else settings.bucket_region
    return boto3.client(service_name="s3", region_name=target_region)
|
||||
|
||||
|
||||
def ensure_bucket(bucket_name: str, region: str) -> None:
    """Make sure *bucket_name* exists, then enable versioning and AES256 default encryption.

    A HEAD failure with a "not found" code triggers bucket creation; codes
    "301" and "403" are tolerated; any other ``ClientError`` is re-raised.
    """
    not_found_codes = {"404", "NoSuchBucket", "NotFound"}
    tolerated_codes = {"301", "403"}
    s3 = s3_client(region)
    try:
        s3.head_bucket(Bucket=bucket_name)
    except ClientError as exc:
        error_code = exc.response.get("Error", {}).get("Code", "")
        if error_code in not_found_codes:
            create_kwargs = {"Bucket": bucket_name}
            # us-east-1 is created without a LocationConstraint.
            if region != "us-east-1":
                create_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region}
            s3.create_bucket(**create_kwargs)
        elif error_code not in tolerated_codes:
            raise
    s3.put_bucket_versioning(
        Bucket=bucket_name,
        VersioningConfiguration={"Status": "Enabled"},
    )
    default_encryption = {"ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}}
    s3.put_bucket_encryption(
        Bucket=bucket_name,
        ServerSideEncryptionConfiguration={"Rules": [default_encryption]},
    )
|
||||
|
||||
|
||||
def seed_bucket_prefixes(bucket_name: str) -> None:
    """Put one body-less object per standard top-level prefix into *bucket_name*."""
    folder_keys = (
        "models/",
        "workflows/",
        "references/",
        "outputs/",
        "manifests/",
        "bootstrap/",
    )
    s3 = s3_client()
    for folder_key in folder_keys:
        s3.put_object(Bucket=bucket_name, Key=folder_key)
|
||||
|
||||
|
||||
def resolve_model_source_dir(source_relative_path: str) -> Path:
    """Resolve *source_relative_path* under the configured model library root.

    Returns:
        The resolved directory path.

    Raises:
        ValueError: The resolved path lies outside the model library root.
        FileNotFoundError: The resolved path is not an existing directory.
    """
    library_root = settings.model_library_root.resolve()
    candidate = (settings.model_library_root / source_relative_path).resolve()
    inside_root = candidate == library_root or library_root in candidate.parents
    if not inside_root:
        raise ValueError("Model source path escapes configured model library root")
    # is_dir() is already False for a path that does not exist.
    if not candidate.is_dir():
        raise FileNotFoundError(f"Model source directory not found: {candidate}")
    return candidate
|
||||
|
||||
|
||||
def build_model_manifest(source_dir: Path) -> dict:
    """Describe every regular file below *source_dir*.

    Returns:
        A dict with ``generated_at`` (ISO timestamp), ``file_count``,
        ``total_size_bytes`` and ``files``; each file entry carries its POSIX
        path relative to *source_dir*, its SHA-256 hex digest and its size.
    """
    entries: list[dict] = []
    for file_path in sorted(filter(Path.is_file, source_dir.rglob("*"))):
        digest = hashlib.sha256()
        with file_path.open("rb") as stream:
            # Hash in 1 MiB reads so large files are never held in memory whole.
            while block := stream.read(1024 * 1024):
                digest.update(block)
        entries.append(
            {
                "path": file_path.relative_to(source_dir).as_posix(),
                "sha256": digest.hexdigest(),
                "size_bytes": file_path.stat().st_size,
            }
        )
    return {
        "generated_at": utcnow().isoformat(),
        "file_count": len(entries),
        "total_size_bytes": sum(entry["size_bytes"] for entry in entries),
        "files": entries,
    }
|
||||
|
||||
|
||||
def upload_model_directory(
    bucket_name: str,
    model_key: str,
    source_relative_path: str,
    label: str,
    workload_tags: list[str] | None = None,
    compatibility_tags: list[str] | None = None,
) -> dict:
    """Upload a model directory plus its manifest to S3.

    Files go under ``models/<model_key>/`` and the JSON manifest is written to
    ``manifests/models/<model_key>.json``.

    Returns:
        A summary dict holding the key, label, source directory, S3 prefix,
        manifest key, the manifest itself and both tag lists (empty lists when
        the tags were not supplied).
    """
    source_dir = resolve_model_source_dir(source_relative_path)
    manifest = build_model_manifest(source_dir)
    s3 = s3_client()
    s3_prefix = f"models/{model_key}/"
    for relative_path in (entry["path"] for entry in manifest["files"]):
        local_file = source_dir / Path(relative_path)
        s3.upload_file(str(local_file), bucket_name, s3_prefix + relative_path)

    manifest_key = f"manifests/models/{model_key}.json"
    manifest_body = json.dumps(manifest, indent=2).encode("utf-8")
    s3.put_object(
        Bucket=bucket_name,
        Key=manifest_key,
        Body=manifest_body,
        ContentType="application/json",
    )

    summary = {
        "model_key": model_key,
        "label": label,
        "source_dir": str(source_dir),
        "s3_prefix": s3_prefix,
        "manifest_key": manifest_key,
        "manifest": manifest,
        "workload_tags": workload_tags or [],
        "compatibility_tags": compatibility_tags or [],
    }
    return summary
|
||||
|
||||
|
||||
def fetch_on_demand_price(region: str, instance_type: str) -> float | None:
    """Look up the hourly on-demand USD price of *instance_type* in *region*.

    The Pricing API is queried for a shared-tenancy Linux offering with no
    pre-installed software. When the region has no known location name, or the
    response carries no USD price, the static fallback table is consulted.

    Returns:
        The price as a float, or ``None`` when neither the API nor the
        fallback table provides one.
    """
    fallback = ON_DEMAND_PRICE_FALLBACKS.get((region, instance_type))
    location = REGION_LOCATION_MAP.get(region)
    if not location:
        # Unmapped regions cannot be queried, but may still have a fallback price.
        return fallback

    term_filters = {
        "instanceType": instance_type,
        "location": location,
        "operatingSystem": "Linux",
        "tenancy": "Shared",
        "preInstalledSw": "NA",
        "capacitystatus": "Used",
    }
    response = pricing_client().get_products(
        ServiceCode="AmazonEC2",
        Filters=[
            {"Type": "TERM_MATCH", "Field": field_name, "Value": value}
            for field_name, value in term_filters.items()
        ],
        MaxResults=1,
    )
    # Each PriceList entry is a JSON document encoded as a string.
    for raw_item in response.get("PriceList", []):
        on_demand_terms = json.loads(raw_item).get("terms", {}).get("OnDemand", {})
        for term in on_demand_terms.values():
            for dimension in term.get("priceDimensions", {}).values():
                usd = dimension.get("pricePerUnit", {}).get("USD")
                if usd:
                    return float(usd)
    return fallback
|
||||
|
||||
|
||||
def refresh_market_snapshots(db: Session, regions: Iterable[str], profile_rows: Iterable[MachineProfile]) -> None:
    """Record one on-demand and one spot ``MarketSnapshot`` per (region, instance type).

    Only instance types referenced by a profile in the given region are
    queried; regions without any profile are skipped without an AWS call.

    Args:
        db: Session the snapshot rows are added to (not committed here).
        regions: Region codes to refresh.
        profile_rows: Machine profiles supplying ``region`` and ``instance_type``.
    """
    # Materialize once: the profiles are scanned again for every region, which
    # would silently find nothing if a one-shot iterator were passed in.
    profiles = list(profile_rows)

    for region in regions:
        instance_types = sorted({p.instance_type for p in profiles if p.region == region})
        if not instance_types:
            continue
        ec2 = ec2_client(region)
        offerings = ec2.describe_instance_type_offerings(
            LocationType="region",
            Filters=[{"Name": "instance-type", "Values": instance_types}],
        )["InstanceTypeOfferings"]
        available = {item["InstanceType"] for item in offerings}

        for instance_type in instance_types:
            is_offered = instance_type in available
            db.add(
                MarketSnapshot(
                    region=region,
                    instance_type=instance_type,
                    lifecycle="on-demand",
                    offering_available=is_offered,
                    hourly_price_usd=fetch_on_demand_price(region, instance_type),
                    raw_payload={"instance_type": instance_type, "region": region},
                )
            )
            # The spot lookup is best-effort: an API error is recorded as
            # "no spot price" rather than aborting the refresh.
            try:
                spot_history = ec2.describe_spot_price_history(
                    InstanceTypes=[instance_type],
                    ProductDescriptions=["Linux/UNIX"],
                    StartTime=utcnow(),
                    MaxResults=1,
                )["SpotPriceHistory"]
                spot_price = float(spot_history[0]["SpotPrice"]) if spot_history else None
            except ClientError:
                spot_price = None
            db.add(
                MarketSnapshot(
                    region=region,
                    instance_type=instance_type,
                    lifecycle="spot",
                    offering_available=is_offered and spot_price is not None,
                    hourly_price_usd=spot_price,
                    raw_payload={"instance_type": instance_type, "region": region},
                )
            )
|
||||
|
||||
|
||||
def latest_market_price(db: Session, region: str, instance_type: str, lifecycle: str) -> float:
    """Return the most recently observed hourly price for the given market.

    Returns:
        ``hourly_price_usd`` of the newest matching ``MarketSnapshot``, or
        ``0.0`` when there is no snapshot or its price is ``None``.
    """
    newest_snapshot = (
        select(MarketSnapshot)
        .where(
            MarketSnapshot.region == region,
            MarketSnapshot.instance_type == instance_type,
            MarketSnapshot.lifecycle == lifecycle,
        )
        .order_by(MarketSnapshot.observed_at.desc())
        # Only the first row is consumed, so do not ask for the rest.
        .limit(1)
    )
    row = db.scalar(newest_snapshot)
    if row is None or row.hourly_price_usd is None:
        return 0.0
    return row.hourly_price_usd
|
||||
|
||||
|
||||
def _attached_volume_gb(ec2, instance: dict) -> int:
    """Sum the ``Size`` of the EBS volumes mapped to *instance*; 0 when the lookup fails."""
    mappings = instance.get("BlockDeviceMappings")
    if not mappings:
        return 0
    try:
        volume_ids = [mapping["Ebs"]["VolumeId"] for mapping in mappings if "Ebs" in mapping]
        if not volume_ids:
            return 0
        volumes = ec2.describe_volumes(VolumeIds=volume_ids)["Volumes"]
    except ClientError:
        return 0
    return sum(volume.get("Size", 0) for volume in volumes)


def _upsert_machine(db: Session, region: str, instance: dict, volume_gb: int) -> None:
    """Create or update the ``Machine`` row matching one describe_instances entry."""
    instance_id = instance["InstanceId"]
    launch_time = instance.get("LaunchTime")
    if launch_time and launch_time.tzinfo is None:
        # Naive timestamps are interpreted as UTC.
        launch_time = launch_time.replace(tzinfo=timezone.utc)
    public_ip = instance.get("PublicIpAddress")
    tags = {tag["Key"]: tag["Value"] for tag in instance.get("Tags", [])}

    observed = {
        "name": tags.get("Name", instance_id),
        "region": region,
        "instance_type": instance["InstanceType"],
        "lifecycle": instance.get("InstanceLifecycle", "on-demand"),
        "state": instance["State"]["Name"],
        "public_ip": public_ip,
        "private_ip": instance.get("PrivateIpAddress"),
        "launch_time": launch_time,
        "volume_gb": volume_gb,
        "public_ipv4_attached": bool(public_ip),
        "details": {
            "key_name": instance.get("KeyName"),
            "subnet_id": instance.get("SubnetId"),
            "security_groups": instance.get("SecurityGroups", []),
            "image_id": instance.get("ImageId"),
            "iam_instance_profile": instance.get("IamInstanceProfile", {}).get("Arn"),
            "availability_zone": instance.get("Placement", {}).get("AvailabilityZone"),
            "public_dns": instance.get("PublicDnsName"),
        },
    }

    existing = db.scalar(select(Machine).where(Machine.aws_instance_id == instance_id))
    if existing:
        # profile_name is deliberately left untouched on existing rows.
        for attribute, value in observed.items():
            setattr(existing, attribute, value)
    else:
        db.add(
            Machine(
                aws_instance_id=instance_id,
                profile_name=tags.get("DesineuronProfile"),
                **observed,
            )
        )


def sync_instances(db: Session, regions: Iterable[str]) -> None:
    """Mirror the EC2 instances of every region into ``Machine`` rows.

    Existing rows (matched on ``aws_instance_id``) are updated in place; unknown
    instances are added. Nothing is committed here.

    Args:
        db: Session used for lookups and inserts.
        regions: Region codes to scan.
    """
    for region in regions:
        ec2 = ec2_client(region)
        # describe_instances is paginated; a single call can miss instances.
        for page in ec2.get_paginator("describe_instances").paginate():
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    _upsert_machine(db, region, instance, _attached_volume_gb(ec2, instance))
|
||||
|
||||
|
||||
def candidate_subnet_ids(region: str, preferred_subnet_id: str) -> list[str]:
    """List the launch-candidate subnets in the preferred subnet's VPC, best first.

    Ordering: the preferred subnet, then subnets that map a public IP on
    launch, then the rest; ties are broken by availability zone and subnet id.

    Returns:
        ``[]`` when no preferred subnet is given, ``[preferred_subnet_id]``
        when the lookup returns nothing, otherwise the ranked subnet ids.
    """
    if not preferred_subnet_id:
        return []

    ec2 = ec2_client(region)
    matches = ec2.describe_subnets(SubnetIds=[preferred_subnet_id])["Subnets"]
    if not matches:
        return [preferred_subnet_id]

    vpc_filters = [
        {"Name": "vpc-id", "Values": [matches[0]["VpcId"]]},
        {"Name": "state", "Values": ["available"]},
    ]
    vpc_subnets = ec2.describe_subnets(Filters=vpc_filters)["Subnets"]

    def rank(subnet: dict) -> tuple[int, str, str]:
        if subnet["SubnetId"] == preferred_subnet_id:
            priority = 0
        elif subnet.get("MapPublicIpOnLaunch"):
            priority = 1
        else:
            priority = 2
        return (priority, subnet.get("AvailabilityZone", ""), subnet["SubnetId"])

    return [subnet["SubnetId"] for subnet in sorted(vpc_subnets, key=rank)]
|
||||
|
||||
|
||||
def calculate_machine_cost(machine: Machine, hourly_rate: float) -> dict:
    """Estimate the accumulated and hourly cost of *machine*.

    Runtime is measured from ``machine.launch_time`` until now; compute,
    storage (gp3, monthly price spread over 730 hours) and public IPv4 charges
    are added up.

    Args:
        machine: Row supplying ``launch_time``, ``volume_gb`` and
            ``public_ipv4_attached``.
        hourly_rate: Compute price in USD per hour.

    Returns:
        A dict with ``runtime_hours``, the three cost components,
        ``total_cost_usd`` and the combined ``hourly_price_usd``.
    """
    launched_at = machine.launch_time
    if not launched_at:
        runtime_hours = 0.0
    else:
        if launched_at.tzinfo is None:
            # Subtracting a naive datetime from an aware one raises TypeError;
            # treat naive launch times as UTC.
            launched_at = launched_at.replace(tzinfo=timezone.utc)
        runtime_hours = max((utcnow() - launched_at).total_seconds() / 3600.0, 0.0)

    # A missing volume size counts as no storage instead of crashing.
    volume_gb = machine.volume_gb or 0
    storage_hourly = (volume_gb * settings.ebs_gp3_per_gb_month) / 730.0
    public_ip_hourly = settings.public_ipv4_per_hour if machine.public_ipv4_attached else 0.0

    compute_cost = runtime_hours * hourly_rate
    storage_cost = runtime_hours * storage_hourly
    public_ip_cost = runtime_hours * public_ip_hourly

    return {
        "runtime_hours": round(runtime_hours, 3),
        "compute_cost_usd": round(compute_cost, 4),
        "storage_cost_usd": round(storage_cost, 4),
        "public_ip_cost_usd": round(public_ip_cost, 4),
        "total_cost_usd": round(compute_cost + storage_cost + public_ip_cost, 4),
        "hourly_price_usd": round(hourly_rate + storage_hourly + public_ip_hourly, 4),
    }
|
||||
|
||||
|
||||
def upsert_session_cost(db: Session, session_row: RuntimeSession, machine: Machine) -> None:
    """Refresh the cost record of *session_row* from the machine's current cost.

    The newest ``SessionCost`` of the session is updated in place; when none
    exists a new one is added. The machine lifecycle defaults to "on-demand".
    """
    lifecycle = machine.lifecycle or "on-demand"
    hourly_rate = latest_market_price(db, machine.region, machine.instance_type, lifecycle)
    cost_payload = calculate_machine_cost(machine, hourly_rate)

    newest_cost = (
        select(SessionCost)
        .where(SessionCost.session_id == session_row.id)
        .order_by(SessionCost.calculated_at.desc())
    )
    record = db.scalar(newest_cost)
    if not record:
        db.add(SessionCost(session_id=session_row.id, **cost_payload))
        return

    # hourly_price_usd from the payload is not copied onto existing records.
    for column in (
        "runtime_hours",
        "compute_cost_usd",
        "storage_cost_usd",
        "public_ip_cost_usd",
        "total_cost_usd",
    ):
        setattr(record, column, cost_payload[column])
    record.calculated_at = utcnow()
|
||||
|
||||
|
||||
def create_managed_instance(db: Session, profile: MachineProfile, actor: str, lifecycle: str) -> RuntimeSession:
    """Launch one EC2 instance for *profile* and record it.

    The launch is attempted in each candidate subnet in turn; capacity-style
    errors move on to the next subnet, any other ``ClientError`` is raised
    immediately. On success a ``Machine`` row, an active session row and an
    audit event are added to *db* (nothing is committed here).

    Args:
        db: Session receiving the new rows.
        profile: Supplies region, instance type and ``launch_config``.
        actor: Name recorded as the launcher.
        lifecycle: ``"spot"`` requests a one-time spot instance; anything else
            launches without market options.

    Returns:
        The newly added session row.

    Raises:
        ClientError: Every candidate subnet failed, or a non-retryable error occurred.
        RuntimeError: No launch attempt could be made at all.
    """
    ec2 = ec2_client(profile.region)
    launch_config = profile.launch_config
    root_volume_gb = int(launch_config.get("root_volume_gb", settings.gpu_root_volume_gb))

    base_run_args = {
        "ImageId": launch_config["ami_id"],
        "InstanceType": profile.instance_type,
        "SecurityGroupIds": launch_config["security_group_ids"],
        "KeyName": launch_config["key_name"],
        "IamInstanceProfile": {"Name": launch_config["instance_profile"]},
        "MinCount": 1,
        "MaxCount": 1,
        "BlockDeviceMappings": [
            {
                "DeviceName": "/dev/sda1",
                "Ebs": {
                    "VolumeSize": root_volume_gb,
                    "VolumeType": "gp3",
                    "DeleteOnTermination": True,
                },
            }
        ],
        "TagSpecifications": [
            {
                "ResourceType": "instance",
                "Tags": [
                    {"Key": "Name", "Value": f"desineuron-{profile.name}-{int(utcnow().timestamp())}"},
                    {"Key": "ManagedBy", "Value": "DesineuronOps"},
                    {"Key": "DesineuronProfile", "Value": profile.name},
                ],
            }
        ],
    }
    if lifecycle == "spot":
        base_run_args["InstanceMarketOptions"] = {
            "MarketType": "spot",
            "SpotOptions": {"SpotInstanceType": "one-time", "InstanceInterruptionBehavior": "terminate"},
        }

    preferred_subnet = launch_config["subnet_id"]
    subnet_ids = candidate_subnet_ids(profile.region, preferred_subnet) or [preferred_subnet]
    # Error codes that justify retrying the launch in another subnet.
    retryable_codes = {"InsufficientInstanceCapacity", "MaxSpotInstanceCountExceeded", "Unsupported"}

    last_exc: Exception | None = None
    response = None
    chosen_subnet = preferred_subnet
    for subnet_id in subnet_ids:
        try:
            response = ec2.run_instances(**base_run_args, SubnetId=subnet_id)
        except ClientError as exc:
            if exc.response.get("Error", {}).get("Code") not in retryable_codes:
                raise
            last_exc = exc
            continue
        chosen_subnet = subnet_id
        break

    if response is None:
        # An explicit check instead of `assert`, which is removed under -O and
        # would turn this into `raise None`.
        if last_exc is None:
            raise RuntimeError("No candidate subnets were available for launch")
        raise last_exc

    instance = response["Instances"][0]
    machine = Machine(
        aws_instance_id=instance["InstanceId"],
        name=f"desineuron-{profile.name}",
        region=profile.region,
        profile_name=profile.name,
        instance_type=profile.instance_type,
        lifecycle=lifecycle,
        state=instance["State"]["Name"],
        public_ip=instance.get("PublicIpAddress"),
        private_ip=instance.get("PrivateIpAddress"),
        launch_time=instance.get("LaunchTime"),
        volume_gb=root_volume_gb,
        public_ipv4_attached=True,
        details={"launched_by": actor, "chosen_subnet_id": chosen_subnet},
    )
    db.add(machine)
    # Flush so machine.id is available for the session row below.
    db.flush()

    session_row = RuntimeSession(machine_id=machine.id, actor=actor, workload_name=profile.name, status="active")
    db.add(session_row)
    db.add(
        AuditEvent(
            actor=actor,
            action="launch_machine",
            entity_type="machine",
            entity_id=machine.aws_instance_id,
            payload={"profile": profile.name, "lifecycle": lifecycle},
        )
    )
    return session_row
|
||||
|
||||
|
||||
def stop_machine(db: Session, machine: Machine, actor: str) -> None:
    """Request an EC2 stop for *machine*, mark it "stopping" and add an audit event."""
    instance_id = machine.aws_instance_id
    ec2_client(machine.region).stop_instances(InstanceIds=[instance_id])
    machine.state = "stopping"
    audit = AuditEvent(
        actor=actor,
        action="stop_machine",
        entity_type="machine",
        entity_id=machine.aws_instance_id,
        payload={},
    )
    db.add(audit)
|
||||
|
||||
|
||||
def terminate_machine(db: Session, machine: Machine, actor: str) -> None:
    """Request EC2 termination of *machine*, mark it "shutting-down" and add an audit event."""
    instance_id = machine.aws_instance_id
    ec2_client(machine.region).terminate_instances(InstanceIds=[instance_id])
    machine.state = "shutting-down"
    audit = AuditEvent(
        actor=actor,
        action="terminate_machine",
        entity_type="machine",
        entity_id=machine.aws_instance_id,
        payload={},
    )
    db.add(audit)
|
||||
|
||||
|
||||
def ssh_run(host: str, user: str, command: str) -> subprocess.CompletedProcess[str]:
    """Run *command* on ``user@host`` over ssh using the configured key.

    Host-key checking is disabled and no known-hosts file is kept. A non-zero
    exit status does not raise; inspect ``returncode`` on the result.

    Returns:
        The completed process with captured text ``stdout`` / ``stderr``.
    """
    import os

    ssh_argv = [
        "ssh",
        "-o",
        "StrictHostKeyChecking=no",
        "-o",
        # The platform's null device: "NUL" only exists on Windows and would
        # be created as an ordinary file elsewhere.
        f"UserKnownHostsFile={os.devnull}",
        "-i",
        str(settings.ssh_key_path),
        f"{user}@{host}",
        command,
    ]
    return subprocess.run(ssh_argv, capture_output=True, text=True, check=False)
|
||||
|
||||
|
||||
def hydrate_model(machine: Machine, model_prefix: str, actor: str, bucket_name: str) -> dict:
    """Copy a model from S3 onto *machine* over ssh and check it against its manifest.

    s5cmd is installed on the machine when missing, the objects under
    *model_prefix* are copied to ``/opt/dlami/nvme/models/<model name>``, and
    every file listed in the model's manifest is tested for existence.

    Args:
        machine: Target machine; must have a public IP.
        model_prefix: S3 key prefix of the model, with or without a trailing slash.
        actor: Accepted for interface compatibility; not used here.
        bucket_name: Bucket holding the model and its manifest.

    Returns:
        ``stdout`` / ``stderr`` / ``returncode`` of the copy command, the
        ``remote_dir`` used, and a ``verify`` dict (``manifest_missing`` with
        return code 1 when the manifest cannot be fetched).

    Raises:
        RuntimeError: The machine has no public IP.
        ValueError: *model_prefix* does not name a model directory.
    """
    if not machine.public_ip:
        raise RuntimeError("Machine has no public IP for hydration")

    # One derivation of the model name for both the remote directory and the
    # manifest key, independent of a trailing slash.
    trimmed_prefix = model_prefix.rstrip("/")
    model_name = trimmed_prefix.split("/")[-1]
    if not model_name:
        raise ValueError("model_prefix must name a model directory")
    s3_prefix = trimmed_prefix + "/"

    host = machine.public_ip
    user = settings.gpu_ssh_user

    # Braces group the download so `sudo mv` only runs when s5cmd was missing;
    # without them `a || b | c && d` runs d whenever a succeeds.
    install_cmd = (
        "command -v s5cmd >/dev/null 2>&1 || { "
        "curl -L https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz "
        "| tar -xz -C /tmp && sudo mv /tmp/s5cmd /usr/local/bin/s5cmd; }"
    )
    ssh_run(host, user, install_cmd)

    remote_dir = f"/opt/dlami/nvme/models/{model_name}"
    source_glob = f"s3://{bucket_name}/{s3_prefix}*"
    # Everything interpolated into the remote shell command is quoted; the
    # wildcard stays quoted so s5cmd, not the shell, expands it.
    copy_cmd = (
        f"mkdir -p {shlex.quote(remote_dir)} && "
        f"s5cmd cp {shlex.quote(source_glob)} {shlex.quote(remote_dir + '/')}"
    )
    result = ssh_run(host, user, copy_cmd)

    manifest_key = f"manifests/models/{model_name}.json"
    try:
        manifest_obj = s3_client().get_object(Bucket=bucket_name, Key=manifest_key)
        manifest = json.loads(manifest_obj["Body"].read().decode("utf-8"))
        checks = " && ".join(
            f"test -f {shlex.quote(remote_dir + '/' + entry['path'])}"
            for entry in manifest.get("files", [])
        ) or "true"
        verify = ssh_run(host, user, checks)
        verify_result = {"stdout": verify.stdout, "stderr": verify.stderr, "returncode": verify.returncode}
    except ClientError:
        verify_result = {"stdout": "", "stderr": "manifest_missing", "returncode": 1}

    return {
        "stdout": result.stdout,
        "stderr": result.stderr,
        "returncode": result.returncode,
        "remote_dir": remote_dir,
        "verify": verify_result,
    }
|
||||
|
||||
|
||||
def start_service(machine: Machine, service_name: str) -> dict:
    """Start a systemd unit on *machine* over ssh and report whether it is active.

    Returns:
        ``stdout`` / ``stderr`` / ``returncode`` of the remote command.

    Raises:
        RuntimeError: The machine has no public IP.
    """
    if not machine.public_ip:
        raise RuntimeError("Machine has no public IP")
    # Quote the unit name: it is interpolated into a remote shell command.
    unit = shlex.quote(service_name)
    result = ssh_run(
        machine.public_ip,
        settings.gpu_ssh_user,
        f"sudo systemctl start {unit} && sudo systemctl is-active {unit}",
    )
    return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}
|
||||
|
||||
|
||||
def stop_service(machine: Machine, service_name: str) -> dict:
    """Stop a systemd unit on *machine* over ssh.

    Returns:
        ``stdout`` / ``stderr`` / ``returncode`` of the remote command.

    Raises:
        RuntimeError: The machine has no public IP.
    """
    if not machine.public_ip:
        raise RuntimeError("Machine has no public IP")
    # Quote the unit name: it is interpolated into a remote shell command.
    unit = shlex.quote(service_name)
    result = ssh_run(machine.public_ip, settings.gpu_ssh_user, f"sudo systemctl stop {unit}")
    return {"stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode}
|
||||
|
||||
|
||||
def export_sessions_csv(db: Session, target_path: str) -> str:
    """Write every session, outer-joined with its cost rows, to a CSV file.

    Sessions without a cost row still appear, with empty cost columns. The
    parent directory of *target_path* is created when missing.

    Returns:
        *target_path*, unchanged.
    """
    header = [
        "session_id",
        "actor",
        "workload",
        "status",
        "started_at",
        "ended_at",
        "runtime_hours",
        "compute_cost_usd",
        "storage_cost_usd",
        "public_ip_cost_usd",
        "total_cost_usd",
    ]
    rows = db.execute(
        select(
            RuntimeSession.id,
            RuntimeSession.actor,
            RuntimeSession.workload_name,
            RuntimeSession.status,
            RuntimeSession.started_at,
            RuntimeSession.ended_at,
            SessionCost.runtime_hours,
            SessionCost.compute_cost_usd,
            SessionCost.storage_cost_usd,
            SessionCost.public_ip_cost_usd,
            SessionCost.total_cost_usd,
        ).join(SessionCost, SessionCost.session_id == RuntimeSession.id, isouter=True)
    )
    # A fresh deployment may not have the export directory yet.
    Path(target_path).parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(rows)
    return target_path
|
||||
@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
from sqlalchemy import select
|
||||
|
||||
from .aws_control import calculate_machine_cost, create_managed_instance, export_sessions_csv, latest_market_price, stop_machine, terminate_machine
|
||||
from .database import Base, engine, session_scope
|
||||
from .models import AuditEvent, Machine, MachineProfile, Session as RuntimeSession
|
||||
|
||||
|
||||
# Typer application object; the commands below register themselves on it.
app = typer.Typer(help="Desineuron Ops CLI")
|
||||
|
||||
|
||||
@app.command("machine-list")
def machine_list():
    # Prints one line per machine, most recently updated first:
    # instance id, instance type, state and the estimated total cost.
    with session_scope() as db:
        machines = db.scalars(select(Machine).order_by(Machine.updated_at.desc())).all()
        for machine in machines:
            # A machine without a recorded lifecycle is priced as on-demand,
            # matching how session costs are computed.
            lifecycle = machine.lifecycle or "on-demand"
            hourly_rate = latest_market_price(db, machine.region, machine.instance_type, lifecycle)
            cost = calculate_machine_cost(machine, hourly_rate)
            typer.echo(f"{machine.aws_instance_id} {machine.instance_type} {machine.state} ${cost['total_cost_usd']:.4f}")
|
||||
|
||||
|
||||
@app.command("machine-launch")
def machine_launch(profile_name: str, lifecycle: str = "spot", actor: str = "cli"):
    # Launches a machine for the named profile and prints a JSON summary.
    # Raises typer.BadParameter when the profile does not exist.
    with session_scope() as db:
        profile = db.scalar(select(MachineProfile).where(MachineProfile.name == profile_name))
        if not profile:
            raise typer.BadParameter(f"Unknown profile: {profile_name}")
        session_row = create_managed_instance(db, profile, actor, lifecycle)
    # Report only after the scope has committed: the session row is added
    # without a flush, so its id may not be assigned before the commit.
    typer.echo(json.dumps({"session_id": session_row.id, "profile": profile_name, "lifecycle": lifecycle}))
|
||||
|
||||
|
||||
@app.command("machine-stop")
def machine_stop(machine_id: str, actor: str = "cli"):
    # Stops the machine with the given AWS instance id, marks its active
    # session (if any) as "stopped" and prints a JSON acknowledgement.
    by_instance_id = select(Machine).where(Machine.aws_instance_id == machine_id)
    with session_scope() as db:
        machine = db.scalar(by_instance_id)
        if not machine:
            raise typer.BadParameter(f"Unknown machine: {machine_id}")
        stop_machine(db, machine, actor)
        active_criteria = (
            RuntimeSession.machine_id == machine.id,
            RuntimeSession.status == "active",
        )
        active_session = db.scalar(select(RuntimeSession).where(*active_criteria))
        if active_session:
            active_session.status = "stopped"
        acknowledgement = {"machine": machine_id, "status": "stopping"}
        typer.echo(json.dumps(acknowledgement))
|
||||
|
||||
|
||||
@app.command("machine-terminate")
def machine_terminate(machine_id: str, actor: str = "cli"):
    # Terminates the machine with the given AWS instance id, marks its active
    # session (if any) as "terminated" and prints a JSON acknowledgement.
    by_instance_id = select(Machine).where(Machine.aws_instance_id == machine_id)
    with session_scope() as db:
        machine = db.scalar(by_instance_id)
        if not machine:
            raise typer.BadParameter(f"Unknown machine: {machine_id}")
        terminate_machine(db, machine, actor)
        active_criteria = (
            RuntimeSession.machine_id == machine.id,
            RuntimeSession.status == "active",
        )
        active_session = db.scalar(select(RuntimeSession).where(*active_criteria))
        if active_session:
            active_session.status = "terminated"
        acknowledgement = {"machine": machine_id, "status": "terminating"}
        typer.echo(json.dumps(acknowledgement))
|
||||
|
||||
|
||||
@app.command("audit-tail")
def audit_tail(limit: int = 20):
    # Prints the newest audit events, one JSON object per line.
    newest_first = select(AuditEvent).order_by(AuditEvent.created_at.desc()).limit(limit)
    with session_scope() as db:
        for event in db.scalars(newest_first).all():
            record = {
                "actor": event.actor,
                "action": event.action,
                "entity": event.entity_id,
                "created_at": event.created_at.isoformat(),
            }
            typer.echo(json.dumps(record))
|
||||
|
||||
|
||||
@app.command("export-sessions")
def export_sessions(output: Path = Path("/app/exports/sessions_cli.csv")):
    # Writes the session/cost CSV to the given path and prints that path.
    destination = str(output)
    with session_scope() as db:
        export_sessions_csv(db, destination)
        typer.echo(destination)
|
||||
|
||||
|
||||
# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
|
||||
@@ -0,0 +1,51 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import json
|
||||
from dataclasses import field
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Settings:
    """Immutable process configuration read from ``OPS_*`` environment variables.

    Every default is evaluated once, when this class body is executed, so the
    environment must be populated before the module is imported. The three
    variables read with ``os.environ[...]`` are mandatory and raise ``KeyError``
    when absent. Fields that hold credentials are excluded from ``repr`` so
    that logging or printing the settings object does not expose them.
    """

    # --- credentials and authentication (hidden from repr) ---
    database_url: str = field(default=os.environ["OPS_DATABASE_URL"], repr=False)
    session_secret: str = field(default=os.environ["OPS_SESSION_SECRET"], repr=False)
    admin_username: str = os.environ.get("OPS_ADMIN_USERNAME", "sagnik")
    admin_password: str = field(default=os.environ["OPS_ADMIN_PASSWORD"], repr=False)
    team_users_json: str = field(default=os.environ.get("OPS_TEAM_USERS_JSON", "[]"), repr=False)

    # --- regions and storage ---
    default_region: str = os.environ.get("OPS_DEFAULT_REGION", "us-east-1")
    # Comma-separated list; blank items are dropped.
    visible_regions: tuple[str, ...] = tuple(
        region.strip() for region in os.environ.get("OPS_VISIBLE_REGIONS", "us-east-1").split(",") if region.strip()
    )
    bucket_name: str = os.environ.get("OPS_BUCKET_NAME", "")
    bucket_region: str = os.environ.get("OPS_BUCKET_REGION", "us-east-1")

    # --- ssh access ---
    ssh_key_path: Path = Path(os.environ.get("OPS_SSH_KEY_PATH", "/app/state/desineuron-l4-node.pem"))
    gpu_ssh_user: str = os.environ.get("OPS_GPU_SSH_USER", "ubuntu")
    ingress_ssh_host: str = os.environ.get("OPS_INGRESS_SSH_HOST", "")
    ingress_ssh_user: str = os.environ.get("OPS_INGRESS_SSH_USER", "ec2-user")
    ingress_ssh_port: int = int(os.environ.get("OPS_INGRESS_SSH_PORT", "22"))
    ingress_route_helper: str = os.environ.get("OPS_INGRESS_ROUTE_HELPER", "/usr/local/bin/manage_desineuron_routes.py")
    public_base_url: str = os.environ.get("OPS_LINUX_PUBLIC_BASE_URL", "https://ops.desineuron.in")

    # --- pricing inputs used for cost estimates ---
    ebs_gp3_per_gb_month: float = float(os.environ.get("OPS_PRICE_EBS_GP3_PER_GB_MONTH", "0.08"))
    public_ipv4_per_hour: float = float(os.environ.get("OPS_PRICE_PUBLIC_IPV4_PER_HOUR", "0.005"))

    # --- machine launch defaults ---
    # Comma-separated list; blank items are dropped.
    allowed_machine_ids: tuple[str, ...] = tuple(
        machine.strip() for machine in os.environ.get("OPS_ALLOWED_MACHINE_IDS", "").split(",") if machine.strip()
    )
    gpu_subnet_id: str = os.environ.get("OPS_GPU_SUBNET_ID", "")
    # Comma-separated list; blank items are dropped.
    gpu_security_group_ids: tuple[str, ...] = tuple(
        group.strip() for group in os.environ.get("OPS_GPU_SECURITY_GROUP_IDS", "").split(",") if group.strip()
    )
    gpu_key_name: str = os.environ.get("OPS_GPU_KEY_NAME", "")
    gpu_ami_id: str = os.environ.get("OPS_GPU_AMI_ID", "")
    gpu_instance_profile: str = os.environ.get("OPS_GPU_INSTANCE_PROFILE", "")
    gpu_root_volume_gb: int = int(os.environ.get("OPS_GPU_ROOT_VOLUME_GB", "300"))

    # --- filesystem locations ---
    export_dir: Path = Path(os.environ.get("OPS_CSV_EXPORT_DIR", "/app/exports"))
    log_dir: Path = Path(os.environ.get("OPS_LOG_DIR", "/app/logs"))
    state_dir: Path = Path(os.environ.get("OPS_STATE_DIR", "/app/state"))
    model_library_root: Path = Path(os.environ.get("OPS_MODEL_LIBRARY_ROOT", "/model-library"))

    # --- Cloudflare ---
    cloudflare_zone_name: str = os.environ.get("OPS_CLOUDFLARE_ZONE_NAME", "desineuron.in")
    cloudflare_api_token: str = field(default=os.environ.get("OPS_CLOUDFLARE_API_TOKEN", ""), repr=False)


# Module-level settings instance imported by the other modules.
settings = Settings()
|
||||
@@ -0,0 +1,41 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import contextmanager
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
|
||||
|
||||
from .config import settings
|
||||
|
||||
|
||||
# Engine for the configured database URL, created with pool_pre_ping enabled.
engine = create_engine(settings.database_url, pool_pre_ping=True)
# Session factory bound to the engine; autoflush and autocommit are off, and
# expire_on_commit=False so loaded attributes are not expired by a commit.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False, expire_on_commit=False)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class for the ORM models of this package."""
|
||||
|
||||
|
||||
def get_db():
    """Yield a database session that is committed on success.

    An exception raised while the session is in use (or during the commit)
    rolls the session back and is re-raised; the session is always closed.
    """
    # session_scope implements exactly this commit / rollback / close cycle.
    with session_scope() as db:
        yield db
|
||||
|
||||
|
||||
@contextmanager
def session_scope():
    """Context manager providing a session that is committed on clean exit.

    An exception inside the ``with`` body (or during the commit) triggers a
    rollback and is re-raised; the session is closed in every case.
    """
    db_session = SessionLocal()
    try:
        yield db_session
        # Reached only when the with-body finished without raising.
        db_session.commit()
    except Exception:
        db_session.rollback()
        raise
    finally:
        db_session.close()
|
||||
598
infrastructure/ops_control_plane/app/ops_control_plane/main.py
Normal file
598
infrastructure/ops_control_plane/app/ops_control_plane/main.py
Normal file
@@ -0,0 +1,598 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from botocore.exceptions import ClientError
|
||||
from fastapi import Depends, FastAPI, Form, HTTPException, Request
|
||||
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.templating import Jinja2Templates
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.orm import Session
|
||||
from starlette.middleware.sessions import SessionMiddleware
|
||||
|
||||
from .aws_control import calculate_machine_cost, create_managed_instance, ensure_bucket, export_sessions_csv, hydrate_model, latest_market_price, seed_bucket_prefixes, start_service, stop_machine, stop_service, sync_instances, terminate_machine, upload_model_directory
|
||||
from .config import settings
|
||||
from .database import Base, engine, get_db, session_scope
|
||||
from .models import AuditEvent, CsvExport, Job, Machine, MachineProfile, MarketSnapshot, ModelCatalog, RouteBinding, Session as RuntimeSession, SessionCost, User, WorkloadProfile
|
||||
from .route_control import apply_route, remove_route
|
||||
from .seed import seed_defaults
|
||||
from .security import get_current_user, verify_password
|
||||
|
||||
|
||||
# Application object with cookie-backed sessions (used for login state and
# flash messages by the handlers below).
app = FastAPI(title="Desineuron Ops Control Plane")
app.add_middleware(SessionMiddleware, secret_key=settings.session_secret)

# Templates and static assets are resolved relative to this module's directory.
_package_dir = Path(__file__).parent
template_dir = _package_dir / "templates"
static_dir = _package_dir / "static"

templates = Jinja2Templates(directory=str(template_dir))
app.mount(
    "/static",
    StaticFiles(directory=str(static_dir)),
    name="static",
)
|
||||
|
||||
|
||||
def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
def recent_totals(db: Session) -> dict:
    """Sum recorded session costs over the trailing 24 hours and 30 days.

    Args:
        db: Session used to run the two aggregate queries.

    Returns:
        A dict with ``last_24h_usd`` and ``last_30d_usd``, each rounded to
        four decimals. A window without matching rows yields ``0.0``.
    """
    reference = utcnow()

    def spend_since(cutoff: datetime) -> float:
        # COALESCE turns the NULL produced by SUM over zero rows into 0.0.
        total_expr = func.coalesce(func.sum(SessionCost.total_cost_usd), 0.0)
        query = (
            select(total_expr)
            .join(RuntimeSession, RuntimeSession.id == SessionCost.session_id)
            .where(SessionCost.calculated_at >= cutoff)
        )
        return round(float(db.scalar(query) or 0.0), 4)

    return {
        "last_24h_usd": spend_since(reference - timedelta(days=1)),
        "last_30d_usd": spend_since(reference - timedelta(days=30)),
    }
|
||||
|
||||
|
||||
def pop_flash(request: Request) -> dict | None:
    """Remove and return the pending flash message, or ``None`` if there is none."""
    session_data = request.session
    return session_data.pop("flash", None)
|
||||
|
||||
|
||||
def set_flash(request: Request, level: str, message: str) -> None:
    """Store a one-shot flash message (``level`` and ``message``) in the session."""
    flash_entry = dict(level=level, message=message)
    request.session["flash"] = flash_entry
|
||||
|
||||
|
||||
def parse_tag_list(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty tags.

    Args:
        raw: Text such as ``"a, b,,c"``.

    Returns:
        The tags in their original order, e.g. ``["a", "b", "c"]``.
    """
    trimmed = (piece.strip() for piece in raw.split(","))
    return [tag for tag in trimmed if tag]
|
||||
|
||||
|
||||
@app.on_event("startup")
def startup() -> None:
    """Prepare schema, working directories, seed data and (optionally) the bucket."""
    Base.metadata.create_all(bind=engine)

    # Working directories are created up front; existing ones are left alone.
    for directory in (settings.export_dir, settings.log_dir, settings.state_dir):
        directory.mkdir(parents=True, exist_ok=True)

    with session_scope() as db:
        seed_defaults(db)

    # Bucket preparation only runs when a bucket name is configured.
    if not settings.bucket_name:
        return
    ensure_bucket(settings.bucket_name, settings.bucket_region)
    seed_bucket_prefixes(settings.bucket_name)
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
def root(request: Request):
    # Entry point: sessions that carry a username go to the dashboard,
    # everyone else is sent to the login page.
    destination = "/dashboard" if request.session.get("username") else "/login"
    return RedirectResponse(destination, status_code=302)
|
||||
|
||||
|
||||
@app.get("/login", response_class=HTMLResponse)
def login_page(request: Request):
    # Render the login form with no error message.
    context = {"request": request, "error": None}
    return templates.TemplateResponse("login.html", context)
|
||||
|
||||
|
||||
@app.post("/login", response_class=HTMLResponse)
def login(
    request: Request,
    username: str = Form(...),
    password: str = Form(...),
    db: Session = Depends(get_db),
):
    # Look up an active user by name and check the submitted password.
    # Success stores the username in the session and redirects to the
    # dashboard; failure re-renders the form with a 401 status.
    lookup = select(User).where(User.username == username, User.is_active.is_(True))
    user = db.scalar(lookup)

    if user and verify_password(password, user.password_hash):
        request.session["username"] = user.username
        return RedirectResponse("/dashboard", status_code=302)

    failure_context = {"request": request, "error": "Invalid credentials"}
    return templates.TemplateResponse("login.html", failure_context, status_code=401)
|
||||
|
||||
|
||||
@app.get("/logout")
def logout(request: Request):
    # Drop everything stored in the session, then return to the login page.
    session_data = request.session
    session_data.clear()
    return RedirectResponse("/login", status_code=302)
|
||||
|
||||
|
||||
@app.get("/dashboard", response_class=HTMLResponse)
def dashboard(request: Request, current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Render the main dashboard: fleet, profiles, workloads, models, routes,
    # recent jobs/sessions/market rows/audit events, plus a cost summary.
    machines = db.scalars(select(Machine).order_by(Machine.updated_at.desc())).all()
    profiles = db.scalars(select(MachineProfile).order_by(MachineProfile.name)).all()
    workloads = db.scalars(select(WorkloadProfile).order_by(WorkloadProfile.name)).all()
    models = db.scalars(select(ModelCatalog).order_by(ModelCatalog.model_key)).all()
    routes = db.scalars(select(RouteBinding).order_by(RouteBinding.hostname)).all()
    # The listings below are deliberately truncated for display.
    jobs = db.scalars(select(Job).order_by(Job.created_at.desc()).limit(20)).all()
    sessions = db.scalars(select(RuntimeSession).order_by(RuntimeSession.started_at.desc()).limit(20)).all()
    market_rows = db.scalars(select(MarketSnapshot).order_by(MarketSnapshot.observed_at.desc()).limit(100)).all()
    audits = db.scalars(select(AuditEvent).order_by(AuditEvent.created_at.desc()).limit(20)).all()

    # Per-machine cost breakdown keyed by AWS instance id, with fleet totals.
    costs = {}
    total_hourly = 0.0
    total_estimated = 0.0
    for machine in machines:
        hourly_rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle)
        machine_cost = calculate_machine_cost(machine, hourly_rate)
        total_hourly += machine_cost["hourly_price_usd"]
        total_estimated += machine_cost["total_cost_usd"]
        costs[machine.aws_instance_id] = machine_cost

    # Count active sessions/jobs in the database instead of in the truncated
    # 20-row listings above; otherwise any active row that is not among the
    # 20 most recent would be missing from the summary.
    active_sessions = db.scalar(
        select(func.count()).select_from(RuntimeSession).where(RuntimeSession.status == "active")
    )
    active_jobs = db.scalar(
        select(func.count()).select_from(Job).where(Job.status.in_(("queued", "running")))
    )

    summary = {
        "machine_count": len(machines),
        "active_sessions": int(active_sessions or 0),
        "active_jobs": int(active_jobs or 0),
        "routes_active": sum(1 for route in routes if route.status == "active"),
        "hourly_burn_usd": round(total_hourly, 4),
        "fleet_estimated_cost_usd": round(total_estimated, 4),
        **recent_totals(db),
    }
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "user": current_user,
            "machines": machines,
            "profiles": profiles,
            "workloads": workloads,
            "models": models,
            "routes": routes,
            "jobs": jobs,
            "sessions": sessions,
            "market_rows": market_rows,
            "audits": audits,
            "costs": costs,
            "summary": summary,
            "flash": pop_flash(request),
            "bucket_name": settings.bucket_name,
            "regions": settings.visible_regions,
        },
    )
|
||||
|
||||
|
||||
@app.get("/api/markets/instances")
def get_markets(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # For every machine profile, report the most recent on-demand and spot
    # snapshot per visible region (price, availability, last observation time).

    def latest_snapshot(region: str, instance_type: str, lifecycle: str):
        # Only the newest row is needed, so cap the query at one row rather
        # than selecting the whole snapshot history and discarding the rest.
        return db.scalar(
            select(MarketSnapshot)
            .where(
                MarketSnapshot.region == region,
                MarketSnapshot.instance_type == instance_type,
                MarketSnapshot.lifecycle == lifecycle,
            )
            .order_by(MarketSnapshot.observed_at.desc())
            .limit(1)
        )

    profiles = db.scalars(select(MachineProfile).order_by(MachineProfile.name)).all()
    payload = []
    for profile in profiles:
        per_region = {}
        for region in settings.visible_regions:
            on_demand = latest_snapshot(region, profile.instance_type, "on-demand")
            spot = latest_snapshot(region, profile.instance_type, "spot")
            # Newest observation across both lifecycles; None when neither exists.
            observed = [snapshot.observed_at for snapshot in (on_demand, spot) if snapshot and snapshot.observed_at]
            per_region[region] = {
                "on_demand": on_demand.hourly_price_usd if on_demand else None,
                "on_demand_available": bool(on_demand and on_demand.offering_available),
                "spot": spot.hourly_price_usd if spot else None,
                "spot_available": bool(spot and spot.offering_available),
                "last_seen": max(observed, default=None),
            }
        payload.append(
            {
                "profile": profile.name,
                "instance_type": profile.instance_type,
                "gpu_label": profile.gpu_label,
                "vcpu": profile.vcpu,
                "memory_gib": profile.memory_gib,
                "regions": per_region,
            }
        )
    return payload
|
||||
|
||||
|
||||
@app.get("/api/machines")
def get_machines(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # List all machines, most recently updated first, each with a cost
    # estimate based on the latest known market price.

    def describe(machine):
        rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle)
        return {
            "id": machine.id,
            "aws_instance_id": machine.aws_instance_id,
            "name": machine.name,
            "region": machine.region,
            "state": machine.state,
            "instance_type": machine.instance_type,
            "lifecycle": machine.lifecycle,
            "public_ip": machine.public_ip,
            "private_ip": machine.private_ip,
            "cost": calculate_machine_cost(machine, rate),
        }

    fleet = db.scalars(select(Machine).order_by(Machine.updated_at.desc())).all()
    return [describe(machine) for machine in fleet]
|
||||
|
||||
|
||||
@app.post("/api/machines/launch")
def launch_machine(
    request: Request,
    profile_name: str = Form(...),
    lifecycle: str = Form(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Launch an instance from a named profile and track the attempt as a Job.
    # Both outcomes redirect to the dashboard with a flash message; an
    # unknown profile yields a 404.
    profile = db.scalar(select(MachineProfile).where(MachineProfile.name == profile_name))
    if not profile:
        raise HTTPException(status_code=404, detail="Profile not found")

    actor = current_user.username
    job = Job(
        job_type="launch_machine",
        status="running",
        actor=actor,
        payload={"profile_name": profile_name, "lifecycle": lifecycle},
        started_at=utcnow(),
    )
    db.add(job)
    db.flush()

    try:
        session_row = create_managed_instance(db, profile, actor, lifecycle)
    except Exception as exc:
        # AWS client errors carry a service error code; anything else is
        # reported by its exception class name.
        if isinstance(exc, ClientError):
            error_code = exc.response.get("Error", {}).get("Code")
        else:
            error_code = exc.__class__.__name__
        job.status = "failed"
        job.finished_at = utcnow()
        job.result = {"error": str(exc), "code": error_code}
        db.add(
            AuditEvent(
                actor=actor,
                action="launch_machine_failed",
                entity_type="profile",
                entity_id=profile.name,
                payload=job.result,
            )
        )
        set_flash(request, "error", f"Launch failed for {profile.name}: {error_code}")
        return RedirectResponse("/dashboard", status_code=302)

    job.status = "completed"
    job.session_id = session_row.id
    job.finished_at = utcnow()
    job.result = {"session_id": session_row.id}
    set_flash(request, "success", f"Launched {profile.name} as {lifecycle}.")
    return RedirectResponse("/dashboard", status_code=302)
|
||||
|
||||
|
||||
@app.post("/api/machines/{machine_id}/stop")
def api_stop_machine(
    machine_id: int,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Request a stop for the machine, mark its active session (if any) as
    # stopped, and record the action as a Job. HTML clients are redirected
    # to the dashboard; other clients receive a small JSON status.
    machine = db.get(Machine, machine_id)
    if not machine:
        raise HTTPException(status_code=404, detail="Machine not found")

    actor = current_user.username
    job = Job(
        job_type="stop_machine",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"aws_instance_id": machine.aws_instance_id},
        started_at=utcnow(),
    )
    db.add(job)
    stop_machine(db, machine, actor)

    active_query = select(RuntimeSession).where(
        RuntimeSession.machine_id == machine.id,
        RuntimeSession.status == "active",
    )
    active_session = db.scalar(active_query)
    if active_session:
        active_session.status = "stopped"
        active_session.ended_at = utcnow()

    outcome = {"status": "stopping"}
    job.status = "completed"
    job.finished_at = utcnow()
    job.result = {"status": "stopping"}

    if "text/html" not in request.headers.get("accept", ""):
        return outcome
    set_flash(request, "success", f"Stop requested for {machine.aws_instance_id}.")
    return RedirectResponse("/dashboard", status_code=302)
|
||||
|
||||
|
||||
@app.post("/api/machines/{machine_id}/terminate")
def api_terminate_machine(
    machine_id: int,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Request termination of the machine, mark its active session (if any)
    # as terminated, and record the action as a Job. HTML clients are
    # redirected to the dashboard; other clients receive a JSON status.
    machine = db.get(Machine, machine_id)
    if not machine:
        raise HTTPException(status_code=404, detail="Machine not found")

    job = Job(
        job_type="terminate_machine",
        status="running",
        actor=current_user.username,
        machine_id=machine_id,
        payload={"aws_instance_id": machine.aws_instance_id},
        started_at=utcnow(),
    )
    db.add(job)
    terminate_machine(db, machine, current_user.username)

    running_session = db.scalar(
        select(RuntimeSession).where(
            RuntimeSession.machine_id == machine.id,
            RuntimeSession.status == "active",
        )
    )
    if running_session:
        running_session.status = "terminated"
        running_session.ended_at = utcnow()

    job.status = "completed"
    job.finished_at = utcnow()
    job.result = {"status": "terminating"}

    wants_html = "text/html" in request.headers.get("accept", "")
    if wants_html:
        set_flash(request, "success", f"Terminate requested for {machine.aws_instance_id}.")
        return RedirectResponse("/dashboard", status_code=302)
    return {"status": "terminating"}
|
||||
|
||||
|
||||
@app.post("/api/models/hydrate")
def api_hydrate_model(
    request: Request,
    machine_id: int = Form(...),
    model_key: str = Form(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Hydrate a catalogued model onto a machine from the configured bucket.
    # The attempt is tracked as a Job and an AuditEvent; success is decided
    # by the "returncode" entry of the hydrate_model result.
    machine = db.get(Machine, machine_id)
    model = db.scalar(select(ModelCatalog).where(ModelCatalog.model_key == model_key))
    if not (machine and model):
        raise HTTPException(status_code=404, detail="Machine or model not found")
    if not settings.bucket_name:
        raise HTTPException(status_code=400, detail="Bucket is not configured")

    actor = current_user.username
    job = Job(
        job_type="hydrate_model",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"model_key": model_key},
        started_at=utcnow(),
    )
    db.add(job)

    result = hydrate_model(machine, model.s3_prefix, actor, settings.bucket_name)
    db.add(
        AuditEvent(
            actor=actor,
            action="hydrate_model",
            entity_type="machine",
            entity_id=machine.aws_instance_id,
            payload={"model_key": model.model_key, "result": result},
        )
    )

    succeeded = result.get("returncode") == 0
    job.status = "completed" if succeeded else "failed"
    job.finished_at = utcnow()
    job.result = result

    if "text/html" in request.headers.get("accept", ""):
        level = "success" if succeeded else "error"
        outcome = "completed" if succeeded else "failed"
        set_flash(request, level, f"Hydration {outcome} for {model.label} on {machine.aws_instance_id}.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse(result)
|
||||
|
||||
|
||||
@app.post("/api/models/register")
def api_register_model(
    request: Request,
    model_key: str = Form(...),
    label: str = Form(...),
    source_relative_path: str = Form(...),
    workload_tags: str = Form(""),
    compatibility_tags: str = Form(""),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Upload a model directory to the configured bucket and create or update
    # its ModelCatalog row from the returned manifest. The attempt is tracked
    # as a Job plus an AuditEvent. HTML clients are redirected with a flash
    # message; other clients get JSON (or a 500 when the upload fails).
    if not settings.bucket_name:
        raise HTTPException(status_code=400, detail="Bucket is not configured")

    job = Job(
        job_type="register_model",
        status="running",
        actor=current_user.username,
        payload={
            "model_key": model_key,
            "label": label,
            "source_relative_path": source_relative_path,
            "workload_tags": workload_tags,
            "compatibility_tags": compatibility_tags,
        },
        started_at=utcnow(),
    )
    db.add(job)
    wants_html = "text/html" in request.headers.get("accept", "")

    try:
        result = upload_model_directory(
            settings.bucket_name,
            model_key=model_key,
            source_relative_path=source_relative_path,
            label=label,
            workload_tags=parse_tag_list(workload_tags),
            compatibility_tags=parse_tag_list(compatibility_tags),
        )
    except Exception as exc:
        job.status = "failed"
        job.finished_at = utcnow()
        job.result = {"error": str(exc)}
        db.add(AuditEvent(actor=current_user.username, action="register_model_failed", entity_type="model", entity_id=model_key, payload=job.result))
        if wants_html:
            set_flash(request, "error", f"Model ingest failed for {model_key}: {exc}")
            return RedirectResponse("/dashboard", status_code=302)
        # The session dependency rolls back when an exception leaves the
        # handler, which would discard the failed job and audit rows added
        # above. Persist them before signalling the error.
        db.commit()
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    # Catalog fields derived from the upload result; shared by the update
    # and insert paths so the two cannot drift apart.
    manifest = result["manifest"]
    catalog_fields = {
        "label": label,
        "s3_prefix": result["s3_prefix"],
        "expected_manifest": manifest,
        "checksums": {entry["path"]: entry["sha256"] for entry in manifest["files"]},
        "compatibility_tags": result["compatibility_tags"],
        "workload_tags": result["workload_tags"],
        "size_gb": round(manifest["total_size_bytes"] / (1024 ** 3), 3),
    }

    existing = db.scalar(select(ModelCatalog).where(ModelCatalog.model_key == model_key))
    if existing:
        for field_name, value in catalog_fields.items():
            setattr(existing, field_name, value)
    else:
        db.add(ModelCatalog(model_key=model_key, **catalog_fields))

    job.status = "completed"
    job.finished_at = utcnow()
    job.result = {"manifest_key": result["manifest_key"], "file_count": manifest["file_count"]}
    db.add(AuditEvent(actor=current_user.username, action="register_model", entity_type="model", entity_id=model_key, payload=job.result))

    if wants_html:
        set_flash(request, "success", f"Model {model_key} uploaded to S3 and manifest stored.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse(job.result)
|
||||
|
||||
|
||||
@app.post("/api/workloads/start")
def api_start_workload(
    request: Request,
    machine_id: int = Form(...),
    workload_name: str = Form(...),
    auto_route: bool = Form(False),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Start a workload service on a machine and, when requested and possible,
    # publish an HTTP route to it. Tracked as a Job plus an AuditEvent.
    machine = db.get(Machine, machine_id)
    workload = db.scalar(select(WorkloadProfile).where(WorkloadProfile.name == workload_name))
    if not machine or not workload:
        raise HTTPException(status_code=404, detail="Machine or workload not found")

    job = Job(
        job_type="start_workload",
        status="running",
        actor=current_user.username,
        machine_id=machine_id,
        payload={"workload_name": workload_name, "auto_route": auto_route},
        started_at=utcnow(),
    )
    db.add(job)

    result = start_service(machine, workload.name)
    started = result.get("returncode") == 0

    route_result = None
    # Routing needs a started service, the caller's opt-in, and a complete
    # target (hostname, port and private IP).
    if started and auto_route and workload.route_hostname and workload.default_port and machine.private_ip:
        route_result = apply_route(workload.route_hostname, "http", machine.private_ip, workload.default_port)
        # Only record the binding as active when the route was actually
        # applied; a failed apply previously still produced an "active" row.
        if route_result.get("returncode") == 0:
            binding_details = {"managed_by": "ops_control_plane", "machine_id": machine.aws_instance_id}
            existing = db.scalar(select(RouteBinding).where(RouteBinding.hostname == workload.route_hostname))
            if existing:
                existing.scheme = "http"
                existing.target_host = machine.private_ip
                existing.target_port = workload.default_port
                existing.status = "active"
                existing.details = binding_details
            else:
                db.add(
                    RouteBinding(
                        hostname=workload.route_hostname,
                        target_type="managed",
                        target_host=machine.private_ip,
                        target_port=workload.default_port,
                        scheme="http",
                        status="active",
                        details=binding_details,
                    )
                )

    db.add(AuditEvent(actor=current_user.username, action="start_workload", entity_type="machine", entity_id=machine.aws_instance_id, payload={"workload": workload.name, "result": result}))
    job.status = "completed" if started else "failed"
    job.finished_at = utcnow()
    job.result = {"service": result, "route": route_result}

    if "text/html" in request.headers.get("accept", ""):
        set_flash(
            request,
            "success" if started else "error",
            f"Start workload {'completed' if started else 'failed'} for {workload.name} on {machine.aws_instance_id}.",
        )
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse({"service": result, "route": route_result})
|
||||
|
||||
|
||||
@app.post("/api/workloads/{machine_id}/stop")
def api_stop_workload(
    machine_id: int,
    request: Request,
    workload_name: str = Form(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Stop a workload service on a machine; tracked as a Job plus an
    # AuditEvent. Success is decided by the "returncode" of the result.
    machine = db.get(Machine, machine_id)
    if not machine:
        raise HTTPException(status_code=404, detail="Machine not found")

    actor = current_user.username
    job = Job(
        job_type="stop_workload",
        status="running",
        actor=actor,
        machine_id=machine_id,
        payload={"workload_name": workload_name},
        started_at=utcnow(),
    )
    db.add(job)

    result = stop_service(machine, workload_name)
    db.add(
        AuditEvent(
            actor=actor,
            action="stop_workload",
            entity_type="machine",
            entity_id=machine.aws_instance_id,
            payload={"workload": workload_name, "result": result},
        )
    )

    stopped = result.get("returncode") == 0
    job.status = "completed" if stopped else "failed"
    job.finished_at = utcnow()
    job.result = result

    if "text/html" not in request.headers.get("accept", ""):
        return JSONResponse(result)
    level = "success" if stopped else "error"
    outcome = "completed" if stopped else "failed"
    set_flash(request, level, f"Stop workload {outcome} for {workload_name} on {machine.aws_instance_id}.")
    return RedirectResponse("/dashboard", status_code=302)
|
||||
|
||||
|
||||
@app.post("/api/routes/map")
def api_map_route(
    request: Request,
    hostname: str = Form(...),
    scheme: str = Form(...),
    target_host: str = Form(...),
    target_port: int = Form(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Apply a route for the hostname and, on success, create or update the
    # matching RouteBinding. Tracked as a Job plus an AuditEvent.
    job = Job(
        job_type="map_route",
        status="running",
        actor=current_user.username,
        payload={"hostname": hostname, "scheme": scheme, "target_host": target_host, "target_port": target_port},
        started_at=utcnow(),
    )
    db.add(job)

    result = apply_route(hostname, scheme, target_host, target_port)
    applied = result.get("returncode") == 0

    # Persist the binding as active only when the route was really applied.
    # Previously a failed apply still stored an "active" binding while the
    # job and flash message reported failure.
    if applied:
        existing = db.scalar(select(RouteBinding).where(RouteBinding.hostname == hostname))
        if existing:
            existing.scheme = scheme
            existing.target_host = target_host
            existing.target_port = target_port
            existing.status = "active"
        else:
            db.add(RouteBinding(hostname=hostname, target_type="managed", target_host=target_host, target_port=target_port, scheme=scheme, status="active"))

    db.add(AuditEvent(actor=current_user.username, action="map_route", entity_type="route", entity_id=hostname, payload=result))
    job.status = "completed" if applied else "failed"
    job.finished_at = utcnow()
    job.result = result

    if "text/html" in request.headers.get("accept", ""):
        set_flash(request, "success" if applied else "error", f"Route {'mapped' if applied else 'map failed'} for {hostname}.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse(result)
|
||||
|
||||
|
||||
@app.post("/api/routes/unmap")
def api_unmap_route(
    request: Request,
    hostname: str = Form(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Remove the route for the hostname and, on success, mark the stored
    # RouteBinding as removed. Tracked as a Job plus an AuditEvent.
    job = Job(
        job_type="unmap_route",
        status="running",
        actor=current_user.username,
        payload={"hostname": hostname},
        started_at=utcnow(),
    )
    db.add(job)

    result = remove_route(hostname)
    removed = result.get("returncode") == 0

    # Keep the stored status in step with what happened: a failed removal
    # must not flip the binding to "removed" while the job reports failure.
    if removed:
        existing = db.scalar(select(RouteBinding).where(RouteBinding.hostname == hostname))
        if existing:
            existing.status = "removed"

    db.add(AuditEvent(actor=current_user.username, action="unmap_route", entity_type="route", entity_id=hostname, payload=result))
    job.status = "completed" if removed else "failed"
    job.finished_at = utcnow()
    job.result = result

    if "text/html" in request.headers.get("accept", ""):
        set_flash(request, "success" if removed else "error", f"Route {'removed' if removed else 'removal failed'} for {hostname}.")
        return RedirectResponse("/dashboard", status_code=302)
    return JSONResponse(result)
|
||||
|
||||
|
||||
@app.get("/api/markets/pricing")
def get_market_pricing(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Return the 200 most recently observed market snapshots, newest first.
    newest_first = select(MarketSnapshot).order_by(MarketSnapshot.observed_at.desc()).limit(200)
    payload = []
    for snapshot in db.scalars(newest_first).all():
        payload.append(
            {
                "region": snapshot.region,
                "instance_type": snapshot.instance_type,
                "lifecycle": snapshot.lifecycle,
                "offering_available": snapshot.offering_available,
                "hourly_price_usd": snapshot.hourly_price_usd,
                "observed_at": snapshot.observed_at,
            }
        )
    return payload
|
||||
|
||||
|
||||
@app.get("/api/sessions")
def get_sessions(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Return the 200 most recently started sessions, each with its machine's
    # AWS instance id and the most recently calculated cost (if any).
    sessions = db.scalars(select(RuntimeSession).order_by(RuntimeSession.started_at.desc()).limit(200)).all()
    payload = []
    for session_row in sessions:
        machine = db.get(Machine, session_row.machine_id) if session_row.machine_id else None
        # Only the newest cost row is used, so ask the database for one row
        # instead of selecting every cost record of the session.
        latest_cost = db.scalar(
            select(SessionCost)
            .where(SessionCost.session_id == session_row.id)
            .order_by(SessionCost.calculated_at.desc())
            .limit(1)
        )
        payload.append(
            {
                "id": session_row.id,
                "actor": session_row.actor,
                "workload_name": session_row.workload_name,
                "status": session_row.status,
                "started_at": session_row.started_at,
                "ended_at": session_row.ended_at,
                "notes": session_row.notes,
                "machine": machine.aws_instance_id if machine else None,
                "cost": latest_cost.total_cost_usd if latest_cost else None,
                "runtime_hours": latest_cost.runtime_hours if latest_cost else None,
            }
        )
    return payload
|
||||
|
||||
|
||||
@app.get("/api/costs")
def api_costs(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Per-machine cost estimates, their rounded sum, and the recent
    # 24-hour / 30-day session-cost totals.
    breakdown = []
    running_total = 0.0
    for machine in db.scalars(select(Machine)).all():
        rate = latest_market_price(db, machine.region, machine.instance_type, machine.lifecycle)
        estimate = calculate_machine_cost(machine, rate)
        running_total += estimate["total_cost_usd"]
        breakdown.append({"machine": machine.aws_instance_id, **estimate})

    response = {
        "machines": breakdown,
        "total_estimated_cost_usd": round(running_total, 4),
    }
    response.update(recent_totals(db))
    return response
|
||||
|
||||
|
||||
@app.get("/api/models")
def api_models(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # List catalogued models ordered by model key.
    catalog = db.scalars(select(ModelCatalog).order_by(ModelCatalog.model_key)).all()
    payload = []
    for entry in catalog:
        # A missing manifest is reported as zero files.
        manifest = entry.expected_manifest or {}
        payload.append(
            {
                "model_key": entry.model_key,
                "label": entry.label,
                "s3_prefix": entry.s3_prefix,
                "size_gb": entry.size_gb,
                "workload_tags": entry.workload_tags,
                "compatibility_tags": entry.compatibility_tags,
                "file_count": manifest.get("file_count", 0),
            }
        )
    return payload
|
||||
|
||||
|
||||
@app.get("/api/audit")
def api_audit(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Return the 100 most recent audit events, newest first.
    newest_first = select(AuditEvent).order_by(AuditEvent.created_at.desc()).limit(100)
    payload = []
    for record in db.scalars(newest_first).all():
        payload.append(
            {
                "actor": record.actor,
                "action": record.action,
                "entity_type": record.entity_type,
                "entity_id": record.entity_id,
                "payload": record.payload,
                "created_at": record.created_at,
            }
        )
    return payload
|
||||
|
||||
|
||||
@app.get("/api/jobs")
def api_jobs(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Return the 200 most recently created jobs, newest first.
    newest_first = select(Job).order_by(Job.created_at.desc()).limit(200)
    payload = []
    for entry in db.scalars(newest_first).all():
        payload.append(
            {
                "id": entry.id,
                "job_type": entry.job_type,
                "status": entry.status,
                "actor": entry.actor,
                "machine_id": entry.machine_id,
                "session_id": entry.session_id,
                "payload": entry.payload,
                "result": entry.result,
                "created_at": entry.created_at,
                "finished_at": entry.finished_at,
            }
        )
    return payload
|
||||
|
||||
|
||||
@app.get("/api/exports/csv")
def api_export_csv(current_user: User = Depends(get_current_user), db: Session = Depends(get_db)):
    # Write the sessions CSV to a fixed file in the export directory, record
    # the export, and return the file path.
    target_path = str(settings.export_dir / "sessions_latest.csv")
    export_sessions_csv(db, target_path)
    export_record = CsvExport(
        actor=current_user.username,
        export_type="sessions",
        path=target_path,
        details={"format": "csv"},
    )
    db.add(export_record)
    return {"path": target_path}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 8080, without auto-reload.
    import uvicorn

    uvicorn.run(
        "ops_control_plane.main:app",
        host="0.0.0.0",
        port=8080,
        reload=False,
    )
|
||||
192
infrastructure/ops_control_plane/app/ops_control_plane/models.py
Normal file
192
infrastructure/ops_control_plane/app/ops_control_plane/models.py
Normal file
@@ -0,0 +1,192 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from sqlalchemy import Boolean, DateTime, Float, ForeignKey, Integer, JSON, String, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from .database import Base
|
||||
|
||||
|
||||
def utcnow() -> datetime:
    """Return the current timezone-aware UTC time (used as a column default)."""
    return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
class User(Base):
    """Account row stored in the ``users`` table."""

    __tablename__ = "users"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Unique, indexed login name.
    username: Mapped[str] = mapped_column(
        String(64),
        unique=True,
        index=True,
    )
    password_hash: Mapped[str] = mapped_column(String(255))
    role: Mapped[str] = mapped_column(String(32), default="admin")
    is_active: Mapped[bool] = mapped_column(Boolean, default=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
|
||||
|
||||
|
||||
class MachineProfile(Base):
    """Named instance profile stored in the ``machine_profiles`` table."""

    __tablename__ = "machine_profiles"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(64), unique=True)
    region: Mapped[str] = mapped_column(String(32))
    instance_type: Mapped[str] = mapped_column(String(32))

    # Hardware description of the instance type.
    gpu_label: Mapped[str] = mapped_column(String(64))
    vcpu: Mapped[int] = mapped_column(Integer)
    memory_gib: Mapped[float] = mapped_column(Float)

    preferred_lifecycle: Mapped[str] = mapped_column(
        String(16),
        default="spot",
    )
    # JSON columns; the defaults are factories, so each row gets its own object.
    launch_config: Mapped[dict] = mapped_column(JSON, default=dict)
    intended_workloads: Mapped[list] = mapped_column(JSON, default=list)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
|
||||
|
||||
|
||||
class MarketSnapshot(Base):
    """Observed price/availability row stored in ``market_snapshots``."""

    __tablename__ = "market_snapshots"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # Lookup dimensions; each one is indexed.
    region: Mapped[str] = mapped_column(String(32), index=True)
    instance_type: Mapped[str] = mapped_column(String(32), index=True)
    lifecycle: Mapped[str] = mapped_column(String(16), index=True)

    offering_available: Mapped[bool] = mapped_column(Boolean, default=False)
    # Nullable: a snapshot can exist without a price.
    hourly_price_usd: Mapped[float | None] = mapped_column(
        Float,
        nullable=True,
    )
    source: Mapped[str] = mapped_column(String(32), default="aws")
    raw_payload: Mapped[dict] = mapped_column(JSON, default=dict)
    observed_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        index=True,
    )
|
||||
|
||||
|
||||
class Machine(Base):
    """Tracked compute instance persisted in the ``machines`` table."""

    __tablename__ = "machines"

    # Identity: local surrogate key and the unique, indexed AWS instance id.
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    aws_instance_id: Mapped[str] = mapped_column(
        String(32),
        unique=True,
        index=True,
    )
    name: Mapped[str] = mapped_column(String(128))

    # Placement and sizing; the profile name is optional.
    region: Mapped[str] = mapped_column(String(32))
    profile_name: Mapped[str | None] = mapped_column(String(64), nullable=True)
    instance_type: Mapped[str] = mapped_column(String(32))
    lifecycle: Mapped[str] = mapped_column(String(16))

    # Current state and network addresses (addresses may be NULL).
    state: Mapped[str] = mapped_column(String(32))
    public_ip: Mapped[str | None] = mapped_column(String(64), nullable=True)
    private_ip: Mapped[str | None] = mapped_column(String(64), nullable=True)
    launch_time: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )

    # Attributes stored alongside the machine; defaults are 0 / False.
    volume_gb: Mapped[int] = mapped_column(Integer, default=0)
    public_ipv4_attached: Mapped[bool] = mapped_column(Boolean, default=False)

    # Free-form JSON details and a timestamp refreshed on every update.
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        onupdate=utcnow,
    )
|
||||
|
||||
|
||||
class WorkloadProfile(Base):
    """Named workload definition persisted in ``workload_profiles``."""

    __tablename__ = "workload_profiles"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(64), unique=True)
    service_type: Mapped[str] = mapped_column(String(32))

    # JSON list of model requirements; defaults to a fresh empty list.
    model_requirements: Mapped[list] = mapped_column(JSON, default=list)

    # Optional runtime settings; every column below may be NULL.
    default_port: Mapped[int | None] = mapped_column(Integer, nullable=True)
    start_command: Mapped[str | None] = mapped_column(Text, nullable=True)
    stop_command: Mapped[str | None] = mapped_column(Text, nullable=True)
    healthcheck_path: Mapped[str | None] = mapped_column(
        String(255),
        nullable=True,
    )
    route_hostname: Mapped[str | None] = mapped_column(
        String(255),
        nullable=True,
    )
|
||||
|
||||
|
||||
class Job(Base):
    """Unit of queued work persisted in the ``jobs`` table."""

    __tablename__ = "jobs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # Indexed classification and status; new rows start as "queued".
    job_type: Mapped[str] = mapped_column(String(32), index=True)
    status: Mapped[str] = mapped_column(
        String(32),
        index=True,
        default="queued",
    )

    # JSON input and output documents, each defaulting to an empty dict.
    payload: Mapped[dict] = mapped_column(JSON, default=dict)
    result: Mapped[dict] = mapped_column(JSON, default=dict)

    # Optional actor name and optional links to a machine and a session.
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    machine_id: Mapped[int | None] = mapped_column(
        ForeignKey("machines.id"),
        nullable=True,
    )
    session_id: Mapped[int | None] = mapped_column(
        ForeignKey("sessions.id"),
        nullable=True,
    )

    # Timestamps: creation defaults to ``utcnow``; the other two may be NULL.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
    started_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    finished_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
|
||||
|
||||
|
||||
class Session(Base):
    """Runtime session row in ``sessions``, linked to its cost records."""

    __tablename__ = "sessions"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # Optional machine link, actor name and workload name.
    machine_id: Mapped[int | None] = mapped_column(
        ForeignKey("machines.id"),
        nullable=True,
    )
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    workload_name: Mapped[str | None] = mapped_column(String(64), nullable=True)

    # New rows default to "active" with ``started_at`` set by ``utcnow``.
    status: Mapped[str] = mapped_column(String(32), default="active")
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
    ended_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    notes: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Paired with ``SessionCost.session`` through ``back_populates``.
    cost_records: Mapped[list["SessionCost"]] = relationship(
        back_populates="session",
    )
|
||||
|
||||
|
||||
class SessionCost(Base):
    """Cost calculation for one session, persisted in ``session_costs``."""

    __tablename__ = "session_costs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    session_id: Mapped[int] = mapped_column(ForeignKey("sessions.id"))

    # Runtime and cost figures; every value defaults to 0.0.
    runtime_hours: Mapped[float] = mapped_column(Float, default=0.0)
    compute_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    storage_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    public_ip_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)
    total_cost_usd: Mapped[float] = mapped_column(Float, default=0.0)

    calculated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )

    # Paired with ``Session.cost_records`` through ``back_populates``.
    session: Mapped[Session] = relationship(
        back_populates="cost_records",
    )
|
||||
|
||||
|
||||
class ModelCatalog(Base):
    """Registered model entry persisted in the ``model_catalog`` table."""

    __tablename__ = "model_catalog"

    # Identity: surrogate key, unique model key and a display label.
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    model_key: Mapped[str] = mapped_column(String(128), unique=True)
    label: Mapped[str] = mapped_column(String(255))

    # S3 prefix plus JSON manifest and checksum documents.
    s3_prefix: Mapped[str] = mapped_column(String(512))
    expected_manifest: Mapped[dict] = mapped_column(JSON, default=dict)
    checksums: Mapped[dict] = mapped_column(JSON, default=dict)

    # JSON tag lists, each defaulting to a fresh empty list.
    compatibility_tags: Mapped[list] = mapped_column(JSON, default=list)
    workload_tags: Mapped[list] = mapped_column(JSON, default=list)

    # Size may be NULL.
    size_gb: Mapped[float | None] = mapped_column(Float, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
|
||||
|
||||
|
||||
class MachineModelCache(Base):
    """Per-machine model cache record stored in ``machine_model_cache``."""

    __tablename__ = "machine_model_cache"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # Which machine and which model key this row refers to.
    machine_id: Mapped[int] = mapped_column(ForeignKey("machines.id"))
    model_key: Mapped[str] = mapped_column(String(128))

    # New rows default to "pending"; path and timestamp may be NULL.
    status: Mapped[str] = mapped_column(String(32), default="pending")
    path_on_machine: Mapped[str | None] = mapped_column(
        String(512),
        nullable=True,
    )
    hydrated_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )

    details: Mapped[dict] = mapped_column(JSON, default=dict)
|
||||
|
||||
|
||||
class RouteBinding(Base):
    """Hostname-to-target mapping persisted in ``route_bindings``."""

    __tablename__ = "route_bindings"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # The hostname is unique, so there is at most one binding row per name.
    hostname: Mapped[str] = mapped_column(String(255), unique=True)

    # Target of the binding.
    target_type: Mapped[str] = mapped_column(String(32))
    target_host: Mapped[str] = mapped_column(String(255))
    target_port: Mapped[int] = mapped_column(Integer)

    # Defaults: plain "http" scheme and "active" status.
    scheme: Mapped[str] = mapped_column(String(16), default="http")
    status: Mapped[str] = mapped_column(String(32), default="active")

    # Free-form JSON details and a timestamp refreshed on every update.
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        onupdate=utcnow,
    )
|
||||
|
||||
|
||||
class ServiceState(Base):
    """Recorded status of a named service, stored in ``service_states``."""

    __tablename__ = "service_states"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # The machine link is optional (NULL allowed).
    machine_id: Mapped[int | None] = mapped_column(
        ForeignKey("machines.id"),
        nullable=True,
    )
    service_name: Mapped[str] = mapped_column(String(64))
    status: Mapped[str] = mapped_column(String(32))

    # Free-form JSON details and a timestamp refreshed on every update.
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
        onupdate=utcnow,
    )
|
||||
|
||||
|
||||
class AuditEvent(Base):
    """Audit trail entry persisted in the ``audit_events`` table."""

    __tablename__ = "audit_events"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # Who did what; the actor may be NULL.
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    action: Mapped[str] = mapped_column(String(64))

    # What the action applied to; the entity id may be NULL.
    entity_type: Mapped[str] = mapped_column(String(64))
    entity_id: Mapped[str | None] = mapped_column(String(128), nullable=True)

    # Free-form JSON payload and the creation timestamp.
    payload: Mapped[dict] = mapped_column(JSON, default=dict)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
|
||||
|
||||
|
||||
class CsvExport(Base):
    """Record of a generated CSV export, stored in ``csv_exports``."""

    __tablename__ = "csv_exports"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)

    # The actor may be NULL.
    actor: Mapped[str | None] = mapped_column(String(64), nullable=True)
    export_type: Mapped[str] = mapped_column(String(64))
    path: Mapped[str] = mapped_column(String(512))

    # Free-form JSON details and the creation timestamp.
    details: Mapped[dict] = mapped_column(JSON, default=dict)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=utcnow,
    )
|
||||
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
from .config import settings
|
||||
|
||||
|
||||
def run_ingress_command(command: str) -> subprocess.CompletedProcess[str]:
    """Run *command* on the ingress host over SSH and return the result.

    The connection target, port and identity file come from ``settings``.
    The command is handed to ``ssh`` as one argument and is interpreted by
    the remote shell, so callers must quote any untrusted values they
    interpolate into it.

    Args:
        command: Remote command line to execute.

    Returns:
        The completed ``ssh`` process with captured text ``stdout`` and
        ``stderr``. A non-zero exit status does not raise (``check=False``);
        inspect ``returncode``.
    """
    # Local import keeps this block self-contained without touching the
    # module import list.
    import os

    ssh_argv = [
        "ssh",
        # NOTE(review): host-key verification is disabled here, which leaves
        # the connection open to man-in-the-middle attacks. Confirm this is an
        # accepted trade-off for the ingress host.
        "-o",
        "StrictHostKeyChecking=no",
        # Discard learned host keys. The previous literal "NUL" is only the
        # null device on Windows; on a Linux host ssh would write a regular
        # file named "NUL" in the working directory. os.devnull resolves to
        # the right device for the current platform.
        "-o",
        f"UserKnownHostsFile={os.devnull}",
        "-i",
        str(settings.ssh_key_path),
        "-p",
        str(settings.ingress_ssh_port),
        f"{settings.ingress_ssh_user}@{settings.ingress_ssh_host}",
        command,
    ]
    return subprocess.run(
        ssh_argv,
        capture_output=True,
        text=True,
        check=False,
    )
|
||||
|
||||
|
||||
def apply_route(hostname: str, scheme: str, target_host: str, target_port: int) -> dict:
    """Upsert an ingress route on the ingress host, then reload Caddy.

    The route is serialised to JSON and passed as a single argument to the
    helper named by ``settings.ingress_route_helper``.

    Args:
        hostname: Public hostname to route.
        scheme: Scheme used to reach the target (for example ``"http"``).
        target_host: Host the route forwards to.
        target_port: Port on ``target_host``.

    Returns:
        A dict with the remote command's ``stdout``, ``stderr`` and
        ``returncode``.
    """
    # Local import keeps this block self-contained without touching the
    # module import list.
    import shlex

    payload = json.dumps(
        {
            "hostname": hostname,
            "scheme": scheme,
            "target_host": target_host,
            "target_port": target_port,
        }
    )
    # shlex.quote instead of hand-written single quotes: a "'" inside any
    # field would otherwise end the quoting and let the rest of the payload
    # run as shell on the ingress host.
    remote_command = (
        f"sudo {settings.ingress_route_helper} upsert {shlex.quote(payload)}"
        " && sudo systemctl reload caddy"
    )
    result = run_ingress_command(remote_command)
    return {
        "stdout": result.stdout,
        "stderr": result.stderr,
        "returncode": result.returncode,
    }
|
||||
|
||||
|
||||
def remove_route(hostname: str) -> dict:
    """Delete an ingress route on the ingress host, then reload Caddy.

    Args:
        hostname: Hostname of the route to remove.

    Returns:
        A dict with the remote command's ``stdout``, ``stderr`` and
        ``returncode``.
    """
    # Local import keeps this block self-contained without touching the
    # module import list.
    import shlex

    # The hostname ends up in a command line interpreted by the remote shell.
    # Quoting stops values such as "x; rm -rf /" from being executed. Plain
    # hostnames are left unchanged by shlex.quote.
    remote_command = (
        f"sudo {settings.ingress_route_helper} delete {shlex.quote(hostname)}"
        " && sudo systemctl reload caddy"
    )
    result = run_ingress_command(remote_command)
    return {
        "stdout": result.stdout,
        "stderr": result.stderr,
        "returncode": result.returncode,
    }
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import Depends, HTTPException, Request, status
|
||||
from passlib.context import CryptContext
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .database import get_db
|
||||
from .models import User
|
||||
|
||||
|
||||
pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
|
||||
|
||||
|
||||
def hash_password(password: str) -> str:
    """Return the hash of *password* produced by the module ``pwd_context``."""
    hashed = pwd_context.hash(password)
    return hashed
|
||||
|
||||
|
||||
def verify_password(password: str, password_hash: str) -> bool:
    """Check *password* against *password_hash* using ``pwd_context``."""
    matches = pwd_context.verify(password, password_hash)
    return matches
|
||||
|
||||
|
||||
def get_current_user(request: Request, db: Session = Depends(get_db)) -> User:
    """Resolve the active ``User`` named in the request session.

    Raises:
        HTTPException: 401 when the session holds no username, or when no
            active user with that username exists.
    """
    session_username = request.session.get("username")
    if session_username:
        # Only active accounts are accepted.
        lookup = select(User).where(
            User.username == session_username,
            User.is_active.is_(True),
        )
        account = db.scalar(lookup)
        if account:
            return account
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED)
|
||||
160
infrastructure/ops_control_plane/app/ops_control_plane/seed.py
Normal file
160
infrastructure/ops_control_plane/app/ops_control_plane/seed.py
Normal file
@@ -0,0 +1,160 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .config import settings
|
||||
from .models import MachineProfile, ModelCatalog, User, WorkloadProfile
|
||||
from .security import hash_password
|
||||
|
||||
|
||||
# Machine profiles inserted by seed_defaults(). Each tuple is
# (name, instance_type, gpu_label, vcpu, memory_gib, lifecycle, workloads);
# every profile is placed in us-east-1.
DEFAULT_MACHINE_PROFILES = [
    {
        "name": profile_name,
        "region": "us-east-1",
        "instance_type": instance_type,
        "gpu_label": gpu_label,
        "vcpu": vcpu,
        "memory_gib": memory_gib,
        "preferred_lifecycle": lifecycle,
        "intended_workloads": workloads,
    }
    for profile_name, instance_type, gpu_label, vcpu, memory_gib, lifecycle, workloads in (
        ("t4g-micro-ingress", "t4g.micro", "Ingress CPU", 2, 1.0, "on-demand", ["ingress"]),
        ("g6-xlarge", "g6.xlarge", "1x NVIDIA L4", 4, 16.0, "spot", ["light-comfy", "qwen-edit"]),
        ("g6-2xlarge", "g6.2xlarge", "1x NVIDIA L4", 8, 32.0, "spot", ["comfyui", "qwen-edit"]),
        (
            "g6-4xlarge",
            "g6.4xlarge",
            "1x NVIDIA L4",
            16,
            64.0,
            "spot",
            ["comfyui", "wan-video", "qwen-edit"],
        ),
        (
            "g6-12xlarge",
            "g6.12xlarge",
            "4x NVIDIA L4",
            48,
            192.0,
            "spot",
            ["comfyui", "batch-storyboard", "qwen-edit", "multi-gpu"],
        ),
    )
]


# Workload definitions inserted by seed_defaults() when missing.
DEFAULT_WORKLOADS = [
    dict(
        name="comfyui",
        service_type="systemd",
        model_requirements=[],
        default_port=8188,
        start_command="sudo systemctl start comfyui",
        stop_command="sudo systemctl stop comfyui",
        healthcheck_path="/",
        route_hostname="comfy.desineuron.in",
    ),
]


# Model catalog entries inserted by seed_defaults() when missing. Each tuple
# is (model_key, label, s3_prefix, compatibility_tags, workload_tags).
DEFAULT_MODELS = [
    {
        "model_key": model_key,
        "label": label,
        "s3_prefix": s3_prefix,
        "compatibility_tags": compatibility_tags,
        "workload_tags": workload_tags,
    }
    for model_key, label, s3_prefix, compatibility_tags, workload_tags in (
        (
            "qwen-image-edit-2511",
            "Qwen Image Edit 2511",
            "models/qwen-image-edit-2511/",
            ["qwen", "image-edit"],
            ["comfyui", "qwen-edit"],
        ),
        (
            "qwen-image-2512",
            "Qwen Image 2512",
            "models/qwen-image-2512/",
            ["qwen", "image"],
            ["comfyui", "qwen-image"],
        ),
    )
]
|
||||
|
||||
|
||||
def _default_launch_config() -> dict:
    """Build a fresh launch-config dict from the GPU settings."""
    return {
        "ami_id": settings.gpu_ami_id,
        "subnet_id": settings.gpu_subnet_id,
        "security_group_ids": list(settings.gpu_security_group_ids),
        "key_name": settings.gpu_key_name,
        "instance_profile": settings.gpu_instance_profile,
        "root_volume_gb": settings.gpu_root_volume_gb,
    }


def seed_defaults(db: Session) -> None:
    """Insert or refresh the default users, profiles, workloads and models.

    Steps, in order:

    1. Add the admin user from ``settings`` when no user has that username.
    2. For every entry in ``settings.team_users_json``: an existing user gets
       its role updated and is re-activated, and its password is changed only
       when the entry sets ``reset_password``; a missing user is added.
    3. For every default machine profile: an existing row has its
       ``launch_config`` replaced from the current settings; a missing row is
       added.
    4. Add default workloads and catalog models that are missing.

    Objects are only added to or modified in *db*; no commit is issued here.

    Args:
        db: SQLAlchemy session to seed.
    """
    admin_lookup = select(User).where(User.username == settings.admin_username)
    if not db.scalar(admin_lookup):
        db.add(
            User(
                username=settings.admin_username,
                password_hash=hash_password(settings.admin_password),
                role="admin",
            )
        )

    # Team users are optional configuration: unparsable or wrongly shaped
    # input is treated as "no team users" rather than aborting the seed.
    try:
        team_users = json.loads(settings.team_users_json)
    except (json.JSONDecodeError, TypeError):
        team_users = []
    if not isinstance(team_users, list):
        team_users = []

    for row in team_users:
        if not isinstance(row, dict):
            continue
        username = row.get("username")
        password = row.get("password")
        role = row.get("role", "operator")
        if not username or not password:
            continue
        existing_user = db.scalar(select(User).where(User.username == username))
        if existing_user:
            existing_user.role = role
            existing_user.is_active = True
            # Passwords of existing users change only on explicit request.
            if row.get("reset_password"):
                existing_user.password_hash = hash_password(password)
            continue
        db.add(User(username=username, password_hash=hash_password(password), role=role))

    for profile in DEFAULT_MACHINE_PROFILES:
        existing = db.scalar(select(MachineProfile).where(MachineProfile.name == profile["name"]))
        # A new dict per profile so rows never share one mutable JSON value.
        launch_config = _default_launch_config()
        if existing:
            existing.launch_config = launch_config
            continue
        db.add(MachineProfile(**profile, launch_config=launch_config))

    for workload in DEFAULT_WORKLOADS:
        if not db.scalar(select(WorkloadProfile).where(WorkloadProfile.name == workload["name"])):
            db.add(WorkloadProfile(**workload))

    for model in DEFAULT_MODELS:
        if not db.scalar(select(ModelCatalog).where(ModelCatalog.model_key == model["model_key"])):
            db.add(ModelCatalog(**model))
|
||||
@@ -0,0 +1,209 @@
|
||||
html{color-scheme:dark}
|
||||
body{
|
||||
font-family:Segoe UI,system-ui,sans-serif;
|
||||
background:
|
||||
radial-gradient(circle at top right, rgba(220,38,38,.18), transparent 28%),
|
||||
radial-gradient(circle at left 20%, rgba(239,68,68,.09), transparent 24%),
|
||||
linear-gradient(180deg, #020202 0%, #070707 100%);
|
||||
color:#f5f5f5;
|
||||
margin:0;
|
||||
min-height:100vh;
|
||||
}
|
||||
.hud-grid{
|
||||
position:fixed;
|
||||
inset:0;
|
||||
pointer-events:none;
|
||||
background-image:
|
||||
linear-gradient(rgba(255,255,255,.02) 1px, transparent 1px),
|
||||
linear-gradient(90deg, rgba(255,255,255,.02) 1px, transparent 1px);
|
||||
background-size:32px 32px;
|
||||
mask-image:linear-gradient(180deg, rgba(0,0,0,.35), rgba(0,0,0,.85));
|
||||
}
|
||||
.topbar{
|
||||
position:sticky;
|
||||
top:0;
|
||||
z-index:10;
|
||||
display:flex;
|
||||
justify-content:space-between;
|
||||
align-items:center;
|
||||
padding:22px 30px;
|
||||
background:rgba(10,10,10,.9);
|
||||
backdrop-filter:blur(18px);
|
||||
border-bottom:1px solid rgba(255,255,255,.07);
|
||||
box-shadow:0 10px 40px rgba(0,0,0,.4);
|
||||
}
|
||||
.topbar h1{
|
||||
margin:0;
|
||||
font-size:24px;
|
||||
letter-spacing:.04em;
|
||||
text-transform:uppercase;
|
||||
}
|
||||
.topbar p{
|
||||
margin:5px 0 0;
|
||||
color:#b8b8b8;
|
||||
max-width:760px;
|
||||
}
|
||||
.topbar-actions{
|
||||
display:flex;
|
||||
gap:12px;
|
||||
align-items:center;
|
||||
}
|
||||
.user-chip{
|
||||
display:inline-flex;
|
||||
align-items:center;
|
||||
padding:8px 12px;
|
||||
border:1px solid rgba(248,113,113,.45);
|
||||
border-radius:999px;
|
||||
color:#fca5a5;
|
||||
background:rgba(127,29,29,.22);
|
||||
box-shadow:0 0 24px rgba(220,38,38,.15) inset;
|
||||
}
|
||||
.topbar-actions a,.button,button{
|
||||
display:inline-flex;
|
||||
align-items:center;
|
||||
justify-content:center;
|
||||
gap:8px;
|
||||
background:linear-gradient(180deg, #ef4444 0%, #991b1b 100%);
|
||||
color:#fff;
|
||||
border:1px solid rgba(248,113,113,.5);
|
||||
border-radius:12px;
|
||||
padding:10px 14px;
|
||||
text-decoration:none;
|
||||
cursor:pointer;
|
||||
box-shadow:0 0 24px rgba(220,38,38,.18);
|
||||
}
|
||||
.button.secondary,button.secondary{
|
||||
background:rgba(255,255,255,.04);
|
||||
border-color:rgba(255,255,255,.14);
|
||||
color:#fff;
|
||||
box-shadow:none;
|
||||
}
|
||||
.button.danger,button.danger{
|
||||
background:linear-gradient(180deg, #dc2626 0%, #7f1d1d 100%);
|
||||
}
|
||||
.page{
|
||||
position:relative;
|
||||
padding:26px;
|
||||
}
|
||||
.grid{display:grid;gap:20px}
|
||||
.grid.two{grid-template-columns:repeat(2,minmax(0,1fr))}
|
||||
.grid.three{grid-template-columns:repeat(3,minmax(0,1fr))}
|
||||
.summary-grid{display:grid;grid-template-columns:repeat(4,minmax(0,1fr));gap:20px;margin-bottom:20px}
|
||||
.card{
|
||||
position:relative;
|
||||
overflow:hidden;
|
||||
background:linear-gradient(180deg, rgba(16,16,16,.88) 0%, rgba(8,8,8,.92) 100%);
|
||||
border:1px solid rgba(255,255,255,.08);
|
||||
border-radius:20px;
|
||||
padding:22px;
|
||||
margin-bottom:20px;
|
||||
box-shadow:
|
||||
0 16px 40px rgba(0,0,0,.45),
|
||||
0 0 0 1px rgba(255,255,255,.02) inset;
|
||||
}
|
||||
.card::after{
|
||||
content:"";
|
||||
position:absolute;
|
||||
inset:auto -20% -60% auto;
|
||||
width:180px;
|
||||
height:180px;
|
||||
background:radial-gradient(circle, rgba(220,38,38,.16), transparent 65%);
|
||||
pointer-events:none;
|
||||
}
|
||||
.card h2{
|
||||
margin:0 0 16px;
|
||||
font-size:18px;
|
||||
letter-spacing:.04em;
|
||||
text-transform:uppercase;
|
||||
}
|
||||
.card.narrow{max-width:460px;margin:90px auto}
|
||||
.card.stat strong{
|
||||
display:block;
|
||||
font-size:30px;
|
||||
margin:8px 0;
|
||||
color:#fff;
|
||||
}
|
||||
.eyebrow{
|
||||
color:#f87171;
|
||||
font-size:11px;
|
||||
letter-spacing:.18em;
|
||||
text-transform:uppercase;
|
||||
}
|
||||
.flash{
|
||||
display:flex;
|
||||
gap:12px;
|
||||
align-items:center;
|
||||
}
|
||||
.flash.success{
|
||||
border-color:rgba(248,113,113,.35);
|
||||
background:linear-gradient(180deg, rgba(127,29,29,.25) 0%, rgba(18,18,18,.95) 100%);
|
||||
}
|
||||
.flash.error{
|
||||
border-color:rgba(248,113,113,.6);
|
||||
background:linear-gradient(180deg, rgba(69,10,10,.55) 0%, rgba(18,18,18,.95) 100%);
|
||||
}
|
||||
.stack{display:grid;gap:12px}
|
||||
.action-stack{display:grid;gap:8px}
|
||||
.plain-list{padding-left:18px;margin:0;display:grid;gap:8px;color:#d6d6d6}
|
||||
.kv-list{display:grid;gap:10px}
|
||||
.kv-list div{display:flex;justify-content:space-between;gap:12px}
|
||||
.checkbox-row{
|
||||
display:flex;
|
||||
align-items:center;
|
||||
gap:10px;
|
||||
color:#f5f5f5;
|
||||
}
|
||||
label{display:grid;gap:6px;color:#d0d0d0}
|
||||
input,select{
|
||||
padding:11px 12px;
|
||||
border-radius:12px;
|
||||
border:1px solid rgba(255,255,255,.12);
|
||||
background:rgba(255,255,255,.03);
|
||||
color:#fff;
|
||||
outline:none;
|
||||
}
|
||||
input:focus,select:focus{
|
||||
border-color:rgba(248,113,113,.75);
|
||||
box-shadow:0 0 0 3px rgba(220,38,38,.16);
|
||||
}
|
||||
table{width:100%;border-collapse:collapse}
|
||||
th,td{
|
||||
padding:12px 10px;
|
||||
border-bottom:1px solid rgba(255,255,255,.08);
|
||||
text-align:left;
|
||||
vertical-align:top;
|
||||
}
|
||||
th{
|
||||
color:#fca5a5;
|
||||
font-weight:600;
|
||||
font-size:12px;
|
||||
letter-spacing:.08em;
|
||||
text-transform:uppercase;
|
||||
}
|
||||
.pill{
|
||||
display:inline-block;
|
||||
padding:4px 10px;
|
||||
border-radius:999px;
|
||||
font-size:12px;
|
||||
background:rgba(255,255,255,.06);
|
||||
color:#f3f3f3;
|
||||
}
|
||||
.pill.available{
|
||||
background:rgba(127,29,29,.45);
|
||||
color:#fecaca;
|
||||
border:1px solid rgba(248,113,113,.3);
|
||||
}
|
||||
.pill.unavailable{
|
||||
background:rgba(31,31,31,.9);
|
||||
color:#d4d4d4;
|
||||
}
|
||||
.pill.unknown{
|
||||
background:rgba(55,65,81,.5);
|
||||
color:#e5e7eb;
|
||||
}
|
||||
.muted{color:#a3a3a3;font-size:12px}
|
||||
.error{color:#fca5a5}
|
||||
|
||||
@media (max-width: 1100px){
|
||||
.grid.two,.grid.three,.summary-grid{grid-template-columns:1fr}
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>{{ title or "Desineuron Ops" }}</title>
|
||||
<link rel="stylesheet" href="/static/style.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="hud-grid" aria-hidden="true"></div>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>Desineuron Ops Control Plane</h1>
|
||||
<p>Linux-hosted AWS control surface for machines, models, routes, and cost</p>
|
||||
</div>
|
||||
{% if user %}
|
||||
<div class="topbar-actions">
|
||||
<span class="user-chip">{{ user.username }}</span>
|
||||
<a href="/logout">Logout</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
</header>
|
||||
<main class="page">
|
||||
{% block content %}{% endblock %}
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,355 @@
|
||||
{% extends "base.html" %}
|
||||
{% block content %}
|
||||
{% if flash %}
|
||||
<section class="card flash {{ flash.level }}">
|
||||
<strong>{{ flash.level|capitalize }}</strong>
|
||||
<span>{{ flash.message }}</span>
|
||||
</section>
|
||||
{% endif %}
|
||||
|
||||
<section class="summary-grid">
|
||||
<article class="card stat">
|
||||
<span class="eyebrow">Machines</span>
|
||||
<strong>{{ summary.machine_count }}</strong>
|
||||
<span class="muted">Known AWS nodes</span>
|
||||
</article>
|
||||
<article class="card stat">
|
||||
<span class="eyebrow">Hourly Burn</span>
|
||||
<strong>${{ summary.hourly_burn_usd }}</strong>
|
||||
<span class="muted">Estimated live blended hourly cost</span>
|
||||
</article>
|
||||
<article class="card stat">
|
||||
<span class="eyebrow">24h Cost</span>
|
||||
<strong>${{ summary.last_24h_usd }}</strong>
|
||||
<span class="muted">Rolling 24 hour estimate</span>
|
||||
</article>
|
||||
<article class="card stat">
|
||||
<span class="eyebrow">30d Cost</span>
|
||||
<strong>${{ summary.last_30d_usd }}</strong>
|
||||
<span class="muted">Rolling 30 day estimate</span>
|
||||
</article>
|
||||
</section>
|
||||
|
||||
<div class="grid three">
|
||||
<section class="card">
|
||||
<h2>Control Surface</h2>
|
||||
<div class="kv-list">
|
||||
<div><span>Bucket</span><strong>{{ bucket_name or "not configured" }}</strong></div>
|
||||
<div><span>Visible regions</span><strong>{{ regions|join(", ") }}</strong></div>
|
||||
<div><span>Active sessions</span><strong>{{ summary.active_sessions }}</strong></div>
|
||||
<div><span>Active jobs</span><strong>{{ summary.active_jobs }}</strong></div>
|
||||
<div><span>Active routes</span><strong>{{ summary.routes_active }}</strong></div>
|
||||
<div><span>Fleet est. cost</span><strong>${{ summary.fleet_estimated_cost_usd }}</strong></div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Launch Machine</h2>
|
||||
<form method="post" action="/api/machines/launch" class="stack">
|
||||
<label>Profile
|
||||
<select name="profile_name">
|
||||
{% for profile in profiles %}
|
||||
<option value="{{ profile.name }}">{{ profile.name }} | {{ profile.instance_type }} | {{ profile.gpu_label }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<label>Lifecycle
|
||||
<select name="lifecycle">
|
||||
<option value="spot">spot</option>
|
||||
<option value="on-demand">on-demand</option>
|
||||
</select>
|
||||
</label>
|
||||
<button type="submit">Launch Selected Machine</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Runbooks</h2>
|
||||
<ul class="plain-list">
|
||||
<li>1. Launch preferred GPU profile.</li>
|
||||
<li>2. Hydrate required model from S3.</li>
|
||||
<li>3. Start workload and optionally map route.</li>
|
||||
<li>4. Monitor runtime and estimated cost.</li>
|
||||
<li>5. Stop or terminate the node when done.</li>
|
||||
</ul>
|
||||
<a class="button secondary" href="/api/exports/csv">Export Sessions CSV</a>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<section class="card">
|
||||
<h2>Markets</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Profile</th>
|
||||
<th>Instance</th>
|
||||
<th>GPU</th>
|
||||
<th>vCPU / RAM</th>
|
||||
<th>Region</th>
|
||||
<th>On-Demand</th>
|
||||
<th>Spot</th>
|
||||
<th>Preferred Use</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for profile in profiles %}
|
||||
{% for region in regions %}
|
||||
{% set ns = namespace(on_demand='-', on_demand_status='unknown', spot='-', spot_status='unknown') %}
|
||||
{% for market in market_rows %}
|
||||
{% if market.region == region and market.instance_type == profile.instance_type and market.lifecycle == 'on-demand' %}
|
||||
{% set ns.on_demand = '$' ~ market.hourly_price_usd if market.hourly_price_usd is not none else '-' %}
|
||||
{% set ns.on_demand_status = 'available' if market.offering_available else 'unavailable' %}
|
||||
{% endif %}
|
||||
{% if market.region == region and market.instance_type == profile.instance_type and market.lifecycle == 'spot' %}
|
||||
{% set ns.spot = '$' ~ market.hourly_price_usd if market.hourly_price_usd is not none else '-' %}
|
||||
{% set ns.spot_status = 'available' if market.offering_available else 'unavailable' %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
<tr>
|
||||
<td>{{ profile.name }}</td>
|
||||
<td>{{ profile.instance_type }}</td>
|
||||
<td>{{ profile.gpu_label }}</td>
|
||||
<td>{{ profile.vcpu }} / {{ profile.memory_gib }} GiB</td>
|
||||
<td>{{ region }}</td>
|
||||
<td><span class="pill {{ ns.on_demand_status }}">{{ ns.on_demand }}</span></td>
|
||||
<td><span class="pill {{ ns.spot_status }}">{{ ns.spot }}</span></td>
|
||||
<td>{{ profile.intended_workloads|join(", ") }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Machines</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Type</th>
|
||||
<th>State</th>
|
||||
<th>IPs</th>
|
||||
<th>Runtime</th>
|
||||
<th>Cost</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for machine in machines %}
|
||||
<tr>
|
||||
<td>
|
||||
<strong>{{ machine.name }}</strong>
|
||||
<div class="muted">{{ machine.aws_instance_id }}</div>
|
||||
</td>
|
||||
<td>
|
||||
<div>{{ machine.instance_type }}</div>
|
||||
<div class="muted">{{ machine.lifecycle }} / {{ machine.region }}</div>
|
||||
</td>
|
||||
<td>{{ machine.state }}</td>
|
||||
<td>
|
||||
<div>{{ machine.public_ip or "-" }}</div>
|
||||
<div class="muted">{{ machine.private_ip or "-" }}</div>
|
||||
</td>
|
||||
<td>{{ costs[machine.aws_instance_id].runtime_hours if machine.aws_instance_id in costs else "-" }} h</td>
|
||||
<td>
|
||||
<div>${{ costs[machine.aws_instance_id].total_cost_usd if machine.aws_instance_id in costs else "-" }}</div>
|
||||
<div class="muted">${{ costs[machine.aws_instance_id].hourly_price_usd if machine.aws_instance_id in costs else "-" }}/hr</div>
|
||||
</td>
|
||||
<td>
|
||||
<div class="action-stack">
|
||||
<form method="post" action="/api/machines/{{ machine.id }}/stop">
|
||||
<button type="submit" class="button secondary">Stop</button>
|
||||
</form>
|
||||
<form method="post" action="/api/machines/{{ machine.id }}/terminate">
|
||||
<button type="submit" class="button danger">Terminate</button>
|
||||
</form>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<div class="grid two">
|
||||
<section class="card">
|
||||
<h2>Model Library Ingest</h2>
|
||||
<form method="post" action="/api/models/register" class="stack">
|
||||
<label>Model Key <input type="text" name="model_key" placeholder="qwen-image-edit-2511" required></label>
|
||||
<label>Label <input type="text" name="label" placeholder="Qwen Image Edit 2511" required></label>
|
||||
<label>Source Path Under Linux Model Library <input type="text" name="source_relative_path" placeholder="Qwen-Image-Edit-2511" required></label>
|
||||
<label>Workload Tags <input type="text" name="workload_tags" placeholder="comfyui, qwen-edit"></label>
|
||||
<label>Compatibility Tags <input type="text" name="compatibility_tags" placeholder="qwen, image-edit"></label>
|
||||
<button type="submit">Upload to S3 + Generate Manifest</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Hydrate Model</h2>
|
||||
<form method="post" action="/api/models/hydrate" class="stack">
|
||||
<label>Machine
|
||||
<select name="machine_id">
|
||||
{% for machine in machines %}
|
||||
<option value="{{ machine.id }}">{{ machine.name }} ({{ machine.aws_instance_id }})</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<label>Model
|
||||
<select name="model_key">
|
||||
{% for model in models %}
|
||||
<option value="{{ model.model_key }}">{{ model.label }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<button type="submit">Hydrate from S3</button>
|
||||
</form>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Start Workload</h2>
|
||||
<form method="post" action="/api/workloads/start" class="stack">
|
||||
<label>Machine
|
||||
<select name="machine_id">
|
||||
{% for machine in machines %}
|
||||
<option value="{{ machine.id }}">{{ machine.name }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<label>Workload
|
||||
<select name="workload_name">
|
||||
{% for workload in workloads %}
|
||||
<option value="{{ workload.name }}">{{ workload.name }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</label>
|
||||
<label class="checkbox-row"><input type="checkbox" name="auto_route" value="true"> Auto-map workload hostname via ingress</label>
|
||||
<button type="submit">Start Workload</button>
|
||||
</form>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<section class="card">
|
||||
<h2>Registered Models</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Model</th><th>S3 Prefix</th><th>Size</th><th>Files</th><th>Tags</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for model in models %}
|
||||
<tr>
|
||||
<td>
|
||||
<strong>{{ model.label }}</strong>
|
||||
<div class="muted">{{ model.model_key }}</div>
|
||||
</td>
|
||||
<td>{{ model.s3_prefix }}</td>
|
||||
<td>{{ model.size_gb or "-" }} GiB</td>
|
||||
<td>{{ model.expected_manifest.file_count if model.expected_manifest else "-" }}</td>
|
||||
<td>
|
||||
<div>{{ model.workload_tags|join(", ") }}</div>
|
||||
<div class="muted">{{ model.compatibility_tags|join(", ") }}</div>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<div class="grid two">
|
||||
<section class="card">
|
||||
<h2>Route Management</h2>
|
||||
<form method="post" action="/api/routes/map" class="stack">
|
||||
<label>Hostname <input type="text" name="hostname" placeholder="gpu-ui.desineuron.in" required></label>
|
||||
<label>Scheme
|
||||
<select name="scheme">
|
||||
<option value="http">http</option>
|
||||
<option value="https">https</option>
|
||||
</select>
|
||||
</label>
|
||||
<label>Target Host <input type="text" name="target_host" placeholder="172.31.x.x" required></label>
|
||||
<label>Target Port <input type="number" name="target_port" value="8188" required></label>
|
||||
<button type="submit">Map Route</button>
|
||||
</form>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Hostname</th><th>Target</th><th>Status</th><th>Action</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for route in routes %}
|
||||
<tr>
|
||||
<td>{{ route.hostname }}</td>
|
||||
<td>{{ route.scheme }}://{{ route.target_host }}:{{ route.target_port }}</td>
|
||||
<td>{{ route.status }}</td>
|
||||
<td>
|
||||
<form method="post" action="/api/routes/unmap">
|
||||
<input type="hidden" name="hostname" value="{{ route.hostname }}">
|
||||
<button type="submit" class="button secondary">Unmap</button>
|
||||
</form>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Recent Sessions</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Actor</th><th>Workload</th><th>Status</th><th>Started</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for session in sessions %}
|
||||
<tr>
|
||||
<td>{{ session.actor }}</td>
|
||||
<td>{{ session.workload_name }}</td>
|
||||
<td>{{ session.status }}</td>
|
||||
<td>{{ session.started_at }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<div class="grid two">
|
||||
<section class="card">
|
||||
<h2>Recent Jobs</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>ID</th><th>Type</th><th>Status</th><th>Actor</th><th>Created</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for job in jobs %}
|
||||
<tr>
|
||||
<td>{{ job.id }}</td>
|
||||
<td>{{ job.job_type }}</td>
|
||||
<td>{{ job.status }}</td>
|
||||
<td>{{ job.actor or "-" }}</td>
|
||||
<td>{{ job.created_at }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
|
||||
<section class="card">
|
||||
<h2>Audit</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Actor</th><th>Action</th><th>Entity</th><th>Time</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for event in audits %}
|
||||
<tr>
|
||||
<td>{{ event.actor or "-" }}</td>
|
||||
<td>{{ event.action }}</td>
|
||||
<td>{{ event.entity_type }} / {{ event.entity_id }}</td>
|
||||
<td>{{ event.created_at }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
</div>
|
||||
{% endblock %}
|
||||
@@ -0,0 +1,14 @@
|
||||
{% extends "base.html" %}
|
||||
{% block content %}
|
||||
<section class="card narrow">
|
||||
<p class="eyebrow">Private Surface</p>
|
||||
<h2>Login</h2>
|
||||
<p class="muted">Use your Desineuron operator account.</p>
|
||||
{% if error %}<p class="error">{{ error }}</p>{% endif %}
|
||||
<form method="post" action="/login" class="stack">
|
||||
<label>Email or username <input type="text" name="username" required></label>
|
||||
<label>Password <input type="password" name="password" required></label>
|
||||
<button type="submit">Enter Ops Console</button>
|
||||
</form>
|
||||
</section>
|
||||
{% endblock %}
|
||||
@@ -0,0 +1,50 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from .aws_control import latest_market_price, refresh_market_snapshots, sync_instances, upsert_session_cost
|
||||
from .database import Base, engine, session_scope
|
||||
from .models import Machine, MachineProfile, Session as RuntimeSession
|
||||
from .seed import seed_defaults
|
||||
|
||||
|
||||
def run_worker() -> None:
    """Run the background reconciliation loop; this function never returns.

    The database tables are created once up front via
    ``Base.metadata.create_all``. After that, each pass of the loop opens a
    ``session_scope()`` and, in order:

    1. calls ``seed_defaults``;
    2. calls ``sync_instances`` with the set of regions found on all
       ``MachineProfile`` rows;
    3. adds an ``"active"`` ``RuntimeSession`` (actor ``"system-import"``)
       for every machine in state ``"running"`` that has no active session;
    4. calls ``refresh_market_snapshots`` on the first pass and afterwards
       whenever more than 15 minutes have elapsed since the last refresh;
    5. calls ``upsert_session_cost`` for every active session whose machine
       row can still be loaded.

    The loop then sleeps for 60 seconds before the next pass.
    """
    Base.metadata.create_all(bind=engine)

    refresh_interval = timedelta(minutes=15)
    market_refreshed_at: datetime | None = None

    while True:
        with session_scope() as db:
            seed_defaults(db)

            profiles = db.scalars(select(MachineProfile)).all()
            sync_instances(db, {entry.region for entry in profiles})

            # Running machines without an active session get one imported so
            # that the cost pass below has a session row to work with.
            running_query = select(Machine).where(Machine.state == "running")
            for machine in db.scalars(running_query).all():
                active_query = select(RuntimeSession).where(
                    RuntimeSession.machine_id == machine.id,
                    RuntimeSession.status == "active",
                )
                if db.scalar(active_query):
                    continue
                imported = RuntimeSession(
                    machine_id=machine.id,
                    actor="system-import",
                    workload_name=machine.profile_name or machine.instance_type,
                    status="active",
                    notes="Imported from existing running machine state",
                )
                db.add(imported)

            # Market snapshots are refreshed on the first pass and then only
            # once the previous refresh is older than the interval.
            if market_refreshed_at is None:
                refresh_due = True
            else:
                elapsed = datetime.now(timezone.utc) - market_refreshed_at
                refresh_due = elapsed > refresh_interval
            if refresh_due:
                refresh_market_snapshots(db, {entry.region for entry in profiles}, profiles)
                market_refreshed_at = datetime.now(timezone.utc)

            active_sessions_query = select(RuntimeSession).where(RuntimeSession.status == "active")
            for session_row in db.scalars(active_sessions_query).all():
                if not session_row.machine_id:
                    continue
                machine = db.get(Machine, session_row.machine_id)
                if not machine:
                    continue
                upsert_session_cost(db, session_row, machine)

        time.sleep(60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_worker()
|
||||
13
infrastructure/ops_control_plane/app/requirements.txt
Normal file
13
infrastructure/ops_control_plane/app/requirements.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
fastapi==0.116.1
|
||||
uvicorn[standard]==0.35.0
|
||||
sqlalchemy==2.0.43
|
||||
psycopg[binary]==3.2.10
|
||||
jinja2==3.1.6
|
||||
python-multipart==0.0.20
|
||||
itsdangerous==2.2.0
|
||||
passlib[bcrypt]==1.7.4
|
||||
boto3==1.40.35
|
||||
httpx==0.28.1
|
||||
typer==0.16.1
|
||||
python-dateutil==2.9.0.post0
|
||||
|
||||
Reference in New Issue
Block a user