feat: Oracle Canvas, Revision History and Canvas Sharing

This commit is contained in:
Sagnik
2026-04-23 01:19:03 +05:30
parent e519339cc9
commit 527b10cd41
58 changed files with 3187 additions and 705 deletions

View File

@@ -1,10 +1,13 @@
"""
backend/services/nemoclaw_client.py - NemoClaw inference client.
Primary path:
1. NVIDIA-hosted OpenAI-compatible chat completions.
2. Optional compatible endpoint via NEMOCLAW_BASE_URL.
3. Optional local Ollama fallback only when ALLOW_LOCAL_FALLBACK=true.
Production path:
1. Shared SGLang / OpenAI-compatible coding runtime.
Compatibility:
- Legacy NEMOCLAW_* env names are still honored.
- Legacy OLLAMA_BASE_URL can still seed the base URL, but Ollama is no longer
a production fallback path.
"""
from __future__ import annotations
@@ -24,28 +27,23 @@ logger = logging.getLogger("velocity.nemoclaw")
# Per-request tuning applied to every NemoClaw chat call.
NEMOCLAW_TIMEOUT = float(os.getenv("NEMOCLAW_TIMEOUT_S", "45.0"))
NEMOCLAW_TEMPERATURE = float(os.getenv("NEMOCLAW_TEMPERATURE", "0.2"))

# Endpoint resolution for the shared SGLang / OpenAI-compatible runtime.
#
# Every setting takes the first NON-EMPTY value in its chain. `or` chaining is
# used instead of nested `os.getenv(name, default)` because the latter only
# falls back when a variable is absent: a variable exported as "" (for example
# `SGLANG_BASE_URL=` in an env file) would otherwise stop the chain and leave
# a relative chat URL such as "/v1/chat/completions".
#
# Base URL precedence: SGLANG_BASE_URL, then the legacy NEMOCLAW_BASE_URL,
# LLM_BASE_URL and OLLAMA_BASE_URL names, then the built-in default.
SGLANG_BASE_URL = (
    os.getenv("SGLANG_BASE_URL")
    or os.getenv("NEMOCLAW_BASE_URL")
    or os.getenv("LLM_BASE_URL")
    or os.getenv("OLLAMA_BASE_URL")
    or "https://llm.desineuron.in"
).rstrip("/")

# Explicit chat URL overrides win; otherwise it is derived from the base URL.
SGLANG_CHAT_URL = (
    os.getenv("SGLANG_CHAT_URL")
    or os.getenv("NEMOCLAW_CHAT_URL")
    or f"{SGLANG_BASE_URL}/v1/chat/completions"
)
SGLANG_MODELS_URL = os.getenv("SGLANG_MODELS_URL") or f"{SGLANG_BASE_URL}/v1/models"

# Model name precedence mirrors the base URL chain.
SGLANG_MODEL = (
    os.getenv("SGLANG_MODEL")
    or os.getenv("NEMOCLAW_MODEL")
    or os.getenv("OLLAMA_MODEL")
    or "qwen3.6:35b-a3b"
)

# Optional bearer token; an empty string means "send no Authorization header".
SGLANG_API_TOKEN = os.getenv("SGLANG_API_TOKEN") or os.getenv("NEMOCLAW_API_TOKEN") or ""

_PROMPT_DIR = os.getenv("NEMOCLAW_PROMPT_DIR", "/opt/dlami/nvme/nemoclaw/prompts")
@@ -201,83 +199,40 @@ async def _nemoclaw_chat(
user_content: str,
timeout: float = NEMOCLAW_TIMEOUT,
) -> dict:
endpoints: list[tuple[str, str, str, dict[str, str]]] = []
if NVIDIA_API_KEY:
endpoints.append(
(
"nvidia_primary",
NVIDIA_CHAT_URL,
NVIDIA_MODEL,
{
"Authorization": f"Bearer {NVIDIA_API_KEY}",
"Content-Type": "application/json",
},
)
)
if NVIDIA_FALLBACK_MODEL and NVIDIA_FALLBACK_MODEL != NVIDIA_MODEL:
endpoints.append(
(
"nvidia_fallback",
NVIDIA_CHAT_URL,
NVIDIA_FALLBACK_MODEL,
{
"Authorization": f"Bearer {NVIDIA_API_KEY}",
"Content-Type": "application/json",
},
)
)
if NEMOCLAW_CHAT_URL:
headers = {"Content-Type": "application/json"}
if NEMOCLAW_API_TOKEN:
headers["Authorization"] = f"Bearer {NEMOCLAW_API_TOKEN}"
endpoints.append(("compatible_endpoint", NEMOCLAW_CHAT_URL, NEMOCLAW_MODEL, headers))
if ALLOW_LOCAL_FALLBACK:
endpoints.append(
("ollama_fallback", OLLAMA_CHAT_URL, OLLAMA_MODEL, {"Content-Type": "application/json"})
if not SGLANG_CHAT_URL:
raise RuntimeError(
"No NemoClaw inference endpoint is configured. Set SGLANG_BASE_URL or NEMOCLAW_BASE_URL."
)
if not endpoints:
raise RuntimeError(
"No NemoClaw inference endpoint is configured. "
"Set NVIDIA_API_KEY or NEMOCLAW_BASE_URL."
)
headers = {"Content-Type": "application/json"}
if SGLANG_API_TOKEN:
headers["Authorization"] = f"Bearer {SGLANG_API_TOKEN}"
t_start = time.monotonic()
last_error: Exception | None = None
for label, url, model, headers in endpoints:
try:
result = await _attempt_chat(
label=label,
url=url,
model=model,
system_content=system_content,
user_content=user_content,
timeout=timeout,
headers=headers,
)
logger.info(
"NemoClaw inference via %s model=%s elapsed=%.2fs",
label,
model,
time.monotonic() - t_start,
)
return result
except (httpx.ConnectError, httpx.TimeoutException) as exc:
logger.warning("NemoClaw %s unreachable (%s), trying next endpoint", label, exc)
last_error = exc
except httpx.HTTPStatusError as exc:
logger.error(
"NemoClaw %s HTTP %s: %s",
label,
exc.response.status_code,
exc.response.text[:300],
)
last_error = exc
except (KeyError, IndexError, TypeError, json.JSONDecodeError) as exc:
logger.error("NemoClaw %s returned invalid JSON: %s", label, exc)
last_error = exc
raise RuntimeError(f"All NemoClaw endpoints failed. Last error: {last_error}")
try:
result = await _attempt_chat(
label="sglang",
url=SGLANG_CHAT_URL,
model=SGLANG_MODEL,
system_content=system_content,
user_content=user_content,
timeout=timeout,
headers=headers,
)
logger.info(
"NemoClaw inference via sglang model=%s elapsed=%.2fs",
SGLANG_MODEL,
time.monotonic() - t_start,
)
return result
except (httpx.ConnectError, httpx.TimeoutException) as exc:
raise RuntimeError(f"NemoClaw SGLang endpoint unreachable: {exc}") from exc
except httpx.HTTPStatusError as exc:
raise RuntimeError(
f"NemoClaw SGLang HTTP {exc.response.status_code}: {exc.response.text[:300]}"
) from exc
except (KeyError, IndexError, TypeError, json.JSONDecodeError) as exc:
raise RuntimeError(f"NemoClaw SGLang returned invalid JSON: {exc}") from exc
async def score_qd(
@@ -368,46 +323,32 @@ async def profile_cctv_visitor(
async def health_check() -> dict:
    """Probe the configured SGLang runtime and report its status.

    Two requests are made with a 5 second client timeout: a GET against
    ``SGLANG_MODELS_URL`` followed by a tiny ``"ping"`` chat completion
    (``max_tokens=5``) against ``SGLANG_CHAT_URL``. Both must return a
    non-error HTTP status for the runtime to be reported as healthy.

    Returns:
        A dict with the keys ``model``, ``primary_url`` and ``models_url``
        (echoing the active configuration) plus ``sglang``, which is ``"ok"``
        on success or ``"error: <detail>"`` when any ``Exception`` is raised
        by either probe. Such exceptions are reported, not propagated.
    """
    headers = {"Content-Type": "application/json"}
    if SGLANG_API_TOKEN:
        headers["Authorization"] = f"Bearer {SGLANG_API_TOKEN}"

    results: dict[str, str] = {
        "model": SGLANG_MODEL,
        "primary_url": SGLANG_CHAT_URL,
        "models_url": SGLANG_MODELS_URL,
    }

    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            models_response = await client.get(SGLANG_MODELS_URL, headers=headers)
            models_response.raise_for_status()
            chat_response = await client.post(
                SGLANG_CHAT_URL,
                json={
                    "model": SGLANG_MODEL,
                    "messages": [{"role": "user", "content": "ping"}],
                    "max_tokens": 5,
                },
                headers=headers,
            )
            chat_response.raise_for_status()
        results["sglang"] = "ok"
    except Exception as exc:
        # Health endpoint boundary: report the failure instead of raising.
        # Some exceptions carry an empty message, which would otherwise
        # produce a bare "error: "; fall back to repr() so the exception
        # type is still visible in the report.
        detail = str(exc) or repr(exc)
        results["sglang"] = f"error: {detail}"
    return results

View File

@@ -13,15 +13,17 @@ import httpx
logger = logging.getLogger("velocity.runtime_llm")

# Endpoint resolution for the single "sglang" runtime provider.
#
# Every setting takes the first NON-EMPTY value in its chain; `or` chaining is
# used so that a variable exported as "" falls through to the next name
# instead of producing a relative URL such as "/v1/chat/completions".
#
# The legacy NEMOCLAW_* names are honored ahead of the other legacy names:
# requests for the "nemoclaw" provider id are still served (as "sglang"), so
# deployments configured only through NEMOCLAW_BASE_URL / NEMOCLAW_MODEL /
# NEMOCLAW_API_TOKEN keep their endpoint, model and credentials.
SGLANG_BASE_URL = (
    os.getenv("SGLANG_BASE_URL")
    or os.getenv("NEMOCLAW_BASE_URL")
    or os.getenv("LLM_BASE_URL")
    or os.getenv("OLLAMA_BASE_URL")
    or "https://llm.desineuron.in"
).rstrip("/")

# Explicit URL overrides win; otherwise both URLs derive from the base URL.
SGLANG_CHAT_URL = (
    os.getenv("SGLANG_CHAT_URL")
    or os.getenv("NEMOCLAW_CHAT_URL")
    or f"{SGLANG_BASE_URL}/v1/chat/completions"
)
SGLANG_MODELS_URL = os.getenv("SGLANG_MODELS_URL") or f"{SGLANG_BASE_URL}/v1/models"

SGLANG_DEFAULT_MODEL = (
    os.getenv("SGLANG_MODEL")
    or os.getenv("NEMOCLAW_MODEL")
    or os.getenv("OLLAMA_MODEL")
    or "qwen3.6:35b-a3b"
)

# Optional bearer token; an empty string means "no Authorization header".
SGLANG_API_TOKEN = os.getenv("SGLANG_API_TOKEN") or os.getenv("NEMOCLAW_API_TOKEN") or ""

RUNTIME_LLM_TIMEOUT_S = float(os.getenv("RUNTIME_LLM_TIMEOUT_S", "90.0"))
RUNTIME_LLM_CONCURRENCY = int(os.getenv("RUNTIME_LLM_BATCH_CONCURRENCY", "2"))
@@ -57,40 +59,30 @@ class RuntimeLLMService:
self._jobs: dict[str, dict[str, Any]] = {}
def _provider_catalog(self) -> list[RuntimeProvider]:
    """List the runtime providers that are currently configured.

    Returns:
        A one-element list holding the "sglang" provider built from the
        module-level SGLANG_* settings, or an empty list when
        ``SGLANG_CHAT_URL`` is empty.
    """
    catalog: list[RuntimeProvider] = []
    if SGLANG_CHAT_URL:
        sglang_provider = RuntimeProvider(
            provider_id="sglang",
            base_url=SGLANG_BASE_URL,
            chat_url=SGLANG_CHAT_URL,
            default_model=SGLANG_DEFAULT_MODEL,
            # An empty token is passed as None rather than "".
            auth_token=SGLANG_API_TOKEN or None,
        )
        catalog.append(sglang_provider)
    return catalog
def get_provider(self, provider_id: str | None) -> RuntimeProvider:
    """Resolve a provider id to a configured runtime provider.

    Args:
        provider_id: The requested provider id. The legacy ids ``"ollama"``
            and ``"nemoclaw"`` are treated as aliases for ``"sglang"``. A
            falsy value (``None`` or ``""``) selects the default provider.

    Returns:
        The matching provider from ``self._provider_catalog()``.

    Raises:
        ValueError: If the requested id is not in the catalog, or if no id
            was given and the "sglang" provider is not configured.
    """
    providers = {provider.provider_id: provider for provider in self._provider_catalog()}

    # Keep the caller's spelling for error reporting; after alias rewriting
    # the message would otherwise name an id the caller never passed.
    requested_id = provider_id
    if provider_id in {"ollama", "nemoclaw"}:
        provider_id = "sglang"

    if provider_id:
        provider = providers.get(provider_id)
        if provider is None:
            raise ValueError(f"Unknown provider '{requested_id}'.")
        return provider

    if "sglang" in providers:
        return providers["sglang"]
    raise ValueError("No runtime LLM providers are configured.")
async def list_providers(self) -> list[dict[str, Any]]:
@@ -101,28 +93,18 @@ class RuntimeLLMService:
error: str | None = None
try:
if provider.provider_id == "ollama":
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.get(OLLAMA_TAGS_URL)
response.raise_for_status()
payload = response.json()
models = [str(item.get("name", "")).strip() for item in payload.get("models", []) if item.get("name")]
if provider.default_model not in models:
models.insert(0, provider.default_model)
status = "online"
else:
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.post(
provider.chat_url,
json={
"model": provider.default_model,
"messages": [{"role": "user", "content": "ping"}],
"max_tokens": 4,
},
headers=provider.headers,
)
response.raise_for_status()
status = "online"
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.get(SGLANG_MODELS_URL, headers=provider.headers)
response.raise_for_status()
payload = response.json()
models = [
str(item.get("id", "")).strip()
for item in payload.get("data", [])
if item.get("id")
]
if provider.default_model not in models:
models.insert(0, provider.default_model)
status = "online"
except Exception as exc: # pragma: no cover - network/runtime dependent
error = str(exc)