feat: Oracle Canvas, Revision History and Canvas Sharing
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
"""
|
||||
backend/services/nemoclaw_client.py - NemoClaw inference client.
|
||||
|
||||
Primary path:
|
||||
1. NVIDIA-hosted OpenAI-compatible chat completions.
|
||||
2. Optional compatible endpoint via NEMOCLAW_BASE_URL.
|
||||
3. Optional local Ollama fallback only when ALLOW_LOCAL_FALLBACK=true.
|
||||
Production path:
|
||||
1. Shared SGLang / OpenAI-compatible coding runtime.
|
||||
|
||||
Compatibility:
|
||||
- Legacy NEMOCLAW_* env names are still honored.
|
||||
- Legacy OLLAMA_BASE_URL can still seed the base URL, but Ollama is no longer
|
||||
a production fallback path.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -24,28 +27,23 @@ logger = logging.getLogger("velocity.nemoclaw")
|
||||
NEMOCLAW_TIMEOUT = float(os.getenv("NEMOCLAW_TIMEOUT_S", "45.0"))
|
||||
NEMOCLAW_TEMPERATURE = float(os.getenv("NEMOCLAW_TEMPERATURE", "0.2"))
|
||||
|
||||
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY", "")
|
||||
NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com/v1")
|
||||
NVIDIA_CHAT_URL = os.getenv("NVIDIA_CHAT_URL", f"{NVIDIA_BASE_URL}/chat/completions")
|
||||
NVIDIA_MODEL = os.getenv("NVIDIA_MODEL", "nvidia/nemotron-3-super-120b-a12b")
|
||||
NVIDIA_FALLBACK_MODEL = os.getenv(
|
||||
"NVIDIA_FALLBACK_MODEL",
|
||||
"nvidia/llama-3.3-nemotron-super-49b-v1",
|
||||
SGLANG_BASE_URL = os.getenv(
|
||||
"SGLANG_BASE_URL",
|
||||
os.getenv(
|
||||
"NEMOCLAW_BASE_URL",
|
||||
os.getenv("LLM_BASE_URL", os.getenv("OLLAMA_BASE_URL", "https://llm.desineuron.in")),
|
||||
),
|
||||
).rstrip("/")
|
||||
SGLANG_CHAT_URL = os.getenv(
|
||||
"SGLANG_CHAT_URL",
|
||||
os.getenv("NEMOCLAW_CHAT_URL", f"{SGLANG_BASE_URL}/v1/chat/completions"),
|
||||
)
|
||||
|
||||
NEMOCLAW_BASE_URL = os.getenv("NEMOCLAW_BASE_URL", "")
|
||||
NEMOCLAW_CHAT_URL = (
|
||||
os.getenv("NEMOCLAW_CHAT_URL") or f"{NEMOCLAW_BASE_URL}/v1/chat/completions"
|
||||
if NEMOCLAW_BASE_URL
|
||||
else ""
|
||||
SGLANG_MODELS_URL = os.getenv("SGLANG_MODELS_URL", f"{SGLANG_BASE_URL}/v1/models")
|
||||
SGLANG_MODEL = os.getenv(
|
||||
"SGLANG_MODEL",
|
||||
os.getenv("NEMOCLAW_MODEL", os.getenv("OLLAMA_MODEL", "qwen3.6:35b-a3b")),
|
||||
)
|
||||
NEMOCLAW_MODEL = os.getenv("NEMOCLAW_MODEL", NVIDIA_MODEL)
|
||||
NEMOCLAW_API_TOKEN = os.getenv("NEMOCLAW_API_TOKEN", "")
|
||||
|
||||
ALLOW_LOCAL_FALLBACK = os.getenv("ALLOW_LOCAL_FALLBACK", "false").lower() == "true"
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434")
|
||||
OLLAMA_CHAT_URL = f"{OLLAMA_BASE_URL}/v1/chat/completions"
|
||||
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:27b")
|
||||
SGLANG_API_TOKEN = os.getenv("SGLANG_API_TOKEN", os.getenv("NEMOCLAW_API_TOKEN", ""))
|
||||
|
||||
_PROMPT_DIR = os.getenv("NEMOCLAW_PROMPT_DIR", "/opt/dlami/nvme/nemoclaw/prompts")
|
||||
|
||||
@@ -201,83 +199,40 @@ async def _nemoclaw_chat(
|
||||
user_content: str,
|
||||
timeout: float = NEMOCLAW_TIMEOUT,
|
||||
) -> dict:
|
||||
endpoints: list[tuple[str, str, str, dict[str, str]]] = []
|
||||
if NVIDIA_API_KEY:
|
||||
endpoints.append(
|
||||
(
|
||||
"nvidia_primary",
|
||||
NVIDIA_CHAT_URL,
|
||||
NVIDIA_MODEL,
|
||||
{
|
||||
"Authorization": f"Bearer {NVIDIA_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
)
|
||||
)
|
||||
if NVIDIA_FALLBACK_MODEL and NVIDIA_FALLBACK_MODEL != NVIDIA_MODEL:
|
||||
endpoints.append(
|
||||
(
|
||||
"nvidia_fallback",
|
||||
NVIDIA_CHAT_URL,
|
||||
NVIDIA_FALLBACK_MODEL,
|
||||
{
|
||||
"Authorization": f"Bearer {NVIDIA_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
)
|
||||
)
|
||||
if NEMOCLAW_CHAT_URL:
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if NEMOCLAW_API_TOKEN:
|
||||
headers["Authorization"] = f"Bearer {NEMOCLAW_API_TOKEN}"
|
||||
endpoints.append(("compatible_endpoint", NEMOCLAW_CHAT_URL, NEMOCLAW_MODEL, headers))
|
||||
if ALLOW_LOCAL_FALLBACK:
|
||||
endpoints.append(
|
||||
("ollama_fallback", OLLAMA_CHAT_URL, OLLAMA_MODEL, {"Content-Type": "application/json"})
|
||||
if not SGLANG_CHAT_URL:
|
||||
raise RuntimeError(
|
||||
"No NemoClaw inference endpoint is configured. Set SGLANG_BASE_URL or NEMOCLAW_BASE_URL."
|
||||
)
|
||||
|
||||
if not endpoints:
|
||||
raise RuntimeError(
|
||||
"No NemoClaw inference endpoint is configured. "
|
||||
"Set NVIDIA_API_KEY or NEMOCLAW_BASE_URL."
|
||||
)
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if SGLANG_API_TOKEN:
|
||||
headers["Authorization"] = f"Bearer {SGLANG_API_TOKEN}"
|
||||
|
||||
t_start = time.monotonic()
|
||||
last_error: Exception | None = None
|
||||
for label, url, model, headers in endpoints:
|
||||
try:
|
||||
result = await _attempt_chat(
|
||||
label=label,
|
||||
url=url,
|
||||
model=model,
|
||||
system_content=system_content,
|
||||
user_content=user_content,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
)
|
||||
logger.info(
|
||||
"NemoClaw inference via %s model=%s elapsed=%.2fs",
|
||||
label,
|
||||
model,
|
||||
time.monotonic() - t_start,
|
||||
)
|
||||
return result
|
||||
except (httpx.ConnectError, httpx.TimeoutException) as exc:
|
||||
logger.warning("NemoClaw %s unreachable (%s), trying next endpoint", label, exc)
|
||||
last_error = exc
|
||||
except httpx.HTTPStatusError as exc:
|
||||
logger.error(
|
||||
"NemoClaw %s HTTP %s: %s",
|
||||
label,
|
||||
exc.response.status_code,
|
||||
exc.response.text[:300],
|
||||
)
|
||||
last_error = exc
|
||||
except (KeyError, IndexError, TypeError, json.JSONDecodeError) as exc:
|
||||
logger.error("NemoClaw %s returned invalid JSON: %s", label, exc)
|
||||
last_error = exc
|
||||
|
||||
raise RuntimeError(f"All NemoClaw endpoints failed. Last error: {last_error}")
|
||||
try:
|
||||
result = await _attempt_chat(
|
||||
label="sglang",
|
||||
url=SGLANG_CHAT_URL,
|
||||
model=SGLANG_MODEL,
|
||||
system_content=system_content,
|
||||
user_content=user_content,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
)
|
||||
logger.info(
|
||||
"NemoClaw inference via sglang model=%s elapsed=%.2fs",
|
||||
SGLANG_MODEL,
|
||||
time.monotonic() - t_start,
|
||||
)
|
||||
return result
|
||||
except (httpx.ConnectError, httpx.TimeoutException) as exc:
|
||||
raise RuntimeError(f"NemoClaw SGLang endpoint unreachable: {exc}") from exc
|
||||
except httpx.HTTPStatusError as exc:
|
||||
raise RuntimeError(
|
||||
f"NemoClaw SGLang HTTP {exc.response.status_code}: {exc.response.text[:300]}"
|
||||
) from exc
|
||||
except (KeyError, IndexError, TypeError, json.JSONDecodeError) as exc:
|
||||
raise RuntimeError(f"NemoClaw SGLang returned invalid JSON: {exc}") from exc
|
||||
|
||||
|
||||
async def score_qd(
|
||||
@@ -368,46 +323,32 @@ async def profile_cctv_visitor(
|
||||
|
||||
|
||||
async def health_check() -> dict:
|
||||
results: dict[str, str] = {}
|
||||
endpoints: list[tuple[str, str, str, dict[str, str]]] = []
|
||||
if NVIDIA_API_KEY:
|
||||
endpoints.append(
|
||||
(
|
||||
"nvidia_primary",
|
||||
NVIDIA_CHAT_URL,
|
||||
NVIDIA_MODEL,
|
||||
{
|
||||
"Authorization": f"Bearer {NVIDIA_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if SGLANG_API_TOKEN:
|
||||
headers["Authorization"] = f"Bearer {SGLANG_API_TOKEN}"
|
||||
|
||||
results: dict[str, str] = {
|
||||
"model": SGLANG_MODEL,
|
||||
"primary_url": SGLANG_CHAT_URL,
|
||||
"models_url": SGLANG_MODELS_URL,
|
||||
}
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
models_response = await client.get(SGLANG_MODELS_URL, headers=headers)
|
||||
models_response.raise_for_status()
|
||||
chat_response = await client.post(
|
||||
SGLANG_CHAT_URL,
|
||||
json={
|
||||
"model": SGLANG_MODEL,
|
||||
"messages": [{"role": "user", "content": "ping"}],
|
||||
"max_tokens": 5,
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
)
|
||||
if NEMOCLAW_CHAT_URL:
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if NEMOCLAW_API_TOKEN:
|
||||
headers["Authorization"] = f"Bearer {NEMOCLAW_API_TOKEN}"
|
||||
endpoints.append(("compatible_endpoint", NEMOCLAW_CHAT_URL, NEMOCLAW_MODEL, headers))
|
||||
if ALLOW_LOCAL_FALLBACK:
|
||||
endpoints.append(
|
||||
("ollama_fallback", OLLAMA_CHAT_URL, OLLAMA_MODEL, {"Content-Type": "application/json"})
|
||||
)
|
||||
chat_response.raise_for_status()
|
||||
results["sglang"] = "ok"
|
||||
except Exception as exc:
|
||||
results["sglang"] = f"error: {exc}"
|
||||
|
||||
for name, url, model, headers in endpoints:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
response = await client.post(
|
||||
url,
|
||||
json={
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": "ping"}],
|
||||
"max_tokens": 5,
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
results[name] = "ok" if response.status_code < 500 else f"http_{response.status_code}"
|
||||
except Exception as exc:
|
||||
results[name] = f"error: {exc}"
|
||||
|
||||
results["model"] = NVIDIA_MODEL if NVIDIA_API_KEY else NEMOCLAW_MODEL
|
||||
results["primary_url"] = NVIDIA_CHAT_URL if NVIDIA_API_KEY else (NEMOCLAW_CHAT_URL or OLLAMA_CHAT_URL)
|
||||
return results
|
||||
|
||||
@@ -13,15 +13,17 @@ import httpx
|
||||
|
||||
logger = logging.getLogger("velocity.runtime_llm")
|
||||
|
||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://127.0.0.1:11434").rstrip("/")
|
||||
OLLAMA_CHAT_URL = os.getenv("OLLAMA_CHAT_URL", f"{OLLAMA_BASE_URL}/v1/chat/completions")
|
||||
OLLAMA_TAGS_URL = os.getenv("OLLAMA_TAGS_URL", f"{OLLAMA_BASE_URL}/api/tags")
|
||||
OLLAMA_DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:27b")
|
||||
|
||||
NEMOCLAW_BASE_URL = os.getenv("NEMOCLAW_BASE_URL", "").rstrip("/")
|
||||
NEMOCLAW_CHAT_URL = (os.getenv("NEMOCLAW_CHAT_URL") or f"{NEMOCLAW_BASE_URL}/v1/chat/completions").rstrip("/") if NEMOCLAW_BASE_URL else ""
|
||||
NEMOCLAW_DEFAULT_MODEL = os.getenv("NEMOCLAW_MODEL", "nvidia/nemotron-3-super-120b-a12b")
|
||||
NEMOCLAW_API_TOKEN = os.getenv("NEMOCLAW_API_TOKEN", "")
|
||||
SGLANG_BASE_URL = os.getenv(
|
||||
"SGLANG_BASE_URL",
|
||||
os.getenv("LLM_BASE_URL", os.getenv("OLLAMA_BASE_URL", "https://llm.desineuron.in")),
|
||||
).rstrip("/")
|
||||
SGLANG_CHAT_URL = os.getenv("SGLANG_CHAT_URL", f"{SGLANG_BASE_URL}/v1/chat/completions")
|
||||
SGLANG_MODELS_URL = os.getenv("SGLANG_MODELS_URL", f"{SGLANG_BASE_URL}/v1/models")
|
||||
SGLANG_DEFAULT_MODEL = os.getenv(
|
||||
"SGLANG_MODEL",
|
||||
os.getenv("OLLAMA_MODEL", "qwen3.6:35b-a3b"),
|
||||
)
|
||||
SGLANG_API_TOKEN = os.getenv("SGLANG_API_TOKEN", "")
|
||||
|
||||
RUNTIME_LLM_TIMEOUT_S = float(os.getenv("RUNTIME_LLM_TIMEOUT_S", "90.0"))
|
||||
RUNTIME_LLM_CONCURRENCY = int(os.getenv("RUNTIME_LLM_BATCH_CONCURRENCY", "2"))
|
||||
@@ -57,40 +59,30 @@ class RuntimeLLMService:
|
||||
self._jobs: dict[str, dict[str, Any]] = {}
|
||||
|
||||
def _provider_catalog(self) -> list[RuntimeProvider]:
|
||||
providers: list[RuntimeProvider] = []
|
||||
if OLLAMA_CHAT_URL:
|
||||
providers.append(
|
||||
RuntimeProvider(
|
||||
provider_id="ollama",
|
||||
base_url=OLLAMA_BASE_URL,
|
||||
chat_url=OLLAMA_CHAT_URL,
|
||||
default_model=OLLAMA_DEFAULT_MODEL,
|
||||
)
|
||||
if not SGLANG_CHAT_URL:
|
||||
return []
|
||||
return [
|
||||
RuntimeProvider(
|
||||
provider_id="sglang",
|
||||
base_url=SGLANG_BASE_URL,
|
||||
chat_url=SGLANG_CHAT_URL,
|
||||
default_model=SGLANG_DEFAULT_MODEL,
|
||||
auth_token=SGLANG_API_TOKEN or None,
|
||||
)
|
||||
if NEMOCLAW_CHAT_URL:
|
||||
providers.append(
|
||||
RuntimeProvider(
|
||||
provider_id="nemoclaw",
|
||||
base_url=NEMOCLAW_BASE_URL,
|
||||
chat_url=NEMOCLAW_CHAT_URL,
|
||||
default_model=NEMOCLAW_DEFAULT_MODEL,
|
||||
auth_token=NEMOCLAW_API_TOKEN or None,
|
||||
)
|
||||
)
|
||||
return providers
|
||||
]
|
||||
|
||||
def get_provider(self, provider_id: str | None) -> RuntimeProvider:
|
||||
providers = {provider.provider_id: provider for provider in self._provider_catalog()}
|
||||
if provider_id in {"ollama", "nemoclaw"}:
|
||||
provider_id = "sglang"
|
||||
if provider_id:
|
||||
provider = providers.get(provider_id)
|
||||
if provider is None:
|
||||
raise ValueError(f"Unknown provider '{provider_id}'.")
|
||||
return provider
|
||||
|
||||
if "nemoclaw" in providers:
|
||||
return providers["nemoclaw"]
|
||||
if "ollama" in providers:
|
||||
return providers["ollama"]
|
||||
if "sglang" in providers:
|
||||
return providers["sglang"]
|
||||
raise ValueError("No runtime LLM providers are configured.")
|
||||
|
||||
async def list_providers(self) -> list[dict[str, Any]]:
|
||||
@@ -101,28 +93,18 @@ class RuntimeLLMService:
|
||||
error: str | None = None
|
||||
|
||||
try:
|
||||
if provider.provider_id == "ollama":
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
response = await client.get(OLLAMA_TAGS_URL)
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
models = [str(item.get("name", "")).strip() for item in payload.get("models", []) if item.get("name")]
|
||||
if provider.default_model not in models:
|
||||
models.insert(0, provider.default_model)
|
||||
status = "online"
|
||||
else:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
response = await client.post(
|
||||
provider.chat_url,
|
||||
json={
|
||||
"model": provider.default_model,
|
||||
"messages": [{"role": "user", "content": "ping"}],
|
||||
"max_tokens": 4,
|
||||
},
|
||||
headers=provider.headers,
|
||||
)
|
||||
response.raise_for_status()
|
||||
status = "online"
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
response = await client.get(SGLANG_MODELS_URL, headers=provider.headers)
|
||||
response.raise_for_status()
|
||||
payload = response.json()
|
||||
models = [
|
||||
str(item.get("id", "")).strip()
|
||||
for item in payload.get("data", [])
|
||||
if item.get("id")
|
||||
]
|
||||
if provider.default_model not in models:
|
||||
models.insert(0, provider.default_model)
|
||||
status = "online"
|
||||
except Exception as exc: # pragma: no cover - network/runtime dependent
|
||||
error = str(exc)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user