forked from sagnik/Project_Velocity
559 lines
21 KiB
Python
559 lines
21 KiB
Python
"""
|
|
Natural DB-first Oracle agent.
|
|
|
|
Pipeline:
|
|
1. schema introspection
|
|
2. semantic SQL planning
|
|
3. plan verification and optional repair
|
|
4. SQL execution
|
|
5. execution quality profiling and auto-replan
|
|
6. visualization planning from actual result shape
|
|
"""
|
|
from __future__ import annotations

import json
import logging
import os
import re
from dataclasses import dataclass
from datetime import date, datetime, time, timedelta
from decimal import Decimal
from typing import Any
from uuid import UUID
|
|
|
|
from backend.services.runtime_llm_service import runtime_llm_service
|
|
from .execution_profiler import execution_profiler
|
|
from .plan_verifier import plan_verifier
|
|
from .semantic_catalog import CATALOG_VERSION, build_semantic_context_for_planner
|
|
from .visualization_planner import VisualizationDecision, visualization_planner
|
|
|
|
try:
|
|
import asyncpg # type: ignore
|
|
except Exception: # pragma: no cover
|
|
asyncpg = None # type: ignore
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
DESTRUCTIVE_SQL = re.compile(
|
|
r"\b(insert|update|delete|drop|alter|truncate|copy|create|grant|revoke|call|execute|do|merge)\b",
|
|
re.IGNORECASE,
|
|
)
|
|
TABLE_REF_RE = re.compile(r"\b(?:from|join)\s+([a-zA-Z_][\w.]*)(?:\s|$)", re.IGNORECASE)
|
|
_MAX_REPLAN_ATTEMPTS = 2
|
|
|
|
|
|
def _json_safe(value: Any) -> Any:
|
|
if isinstance(value, (datetime, date)):
|
|
return value.isoformat()
|
|
if isinstance(value, Decimal):
|
|
return float(value)
|
|
if isinstance(value, (list, tuple)):
|
|
return [_json_safe(item) for item in value]
|
|
if isinstance(value, dict):
|
|
return {str(key): _json_safe(item) for key, item in value.items()}
|
|
return value
|
|
|
|
|
|
def db_ready() -> bool:
    """Report whether a database connection can be attempted.

    Returns ``False`` when ``asyncpg`` could not be imported.  Otherwise returns
    ``True`` as soon as one of the configurations accepted by ``connect_db`` is
    present, checked in the same order:

    1. ``ORACLE_READ_DATABASE_URL`` (non-empty, not a ``PLACEHOLDER`` value)
    2. ``VELOCITY_DB_READ_NAME`` / ``_USER`` / ``_PASSWORD`` all set
    3. ``DATABASE_URL`` (non-empty, not a ``PLACEHOLDER`` value)
    4. ``VELOCITY_DB_NAME`` / ``_USER`` / ``_PASSWORD`` all set
    """
    if asyncpg is None:
        return False

    def _usable_url(url: str) -> bool:
        return bool(url) and not url.startswith("PLACEHOLDER")

    def _all_set(names: tuple[str, ...]) -> bool:
        return all(os.getenv(name) for name in names)

    if _usable_url(os.getenv("ORACLE_READ_DATABASE_URL", "")):
        return True
    # connect_db accepts discrete read-replica credentials, so readiness must
    # recognise them too; otherwise a connectable setup is reported as down.
    if _all_set(("VELOCITY_DB_READ_NAME", "VELOCITY_DB_READ_USER", "VELOCITY_DB_READ_PASSWORD")):
        return True
    if _usable_url(os.getenv("DATABASE_URL", "")):
        return True
    return _all_set(("VELOCITY_DB_NAME", "VELOCITY_DB_USER", "VELOCITY_DB_PASSWORD"))
|
|
|
|
|
|
async def connect_db() -> Any:
    """Open and return a new asyncpg connection.

    The first matching configuration wins:

    1. ``ORACLE_READ_DATABASE_URL`` when set and not a ``PLACEHOLDER`` value
    2. discrete ``VELOCITY_DB_READ_*`` credentials (host/port fall back to the
       ``VELOCITY_DB_HOST`` / ``VELOCITY_DB_PORT`` values, then to defaults)
    3. ``DATABASE_URL`` when set and not a ``PLACEHOLDER`` value
    4. discrete ``VELOCITY_DB_*`` credentials

    Raises:
        RuntimeError: if ``asyncpg`` is not installed.
        KeyError: if the final fallback is reached without the required
            ``VELOCITY_DB_*`` variables.
    """
    if asyncpg is None:
        raise RuntimeError("asyncpg is not installed.")

    env = os.environ

    read_url = env.get("ORACLE_READ_DATABASE_URL", "")
    if read_url and not read_url.startswith("PLACEHOLDER"):
        return await asyncpg.connect(read_url)

    read_credentials = ("VELOCITY_DB_READ_NAME", "VELOCITY_DB_READ_USER", "VELOCITY_DB_READ_PASSWORD")
    if all(env.get(key) for key in read_credentials):
        read_host = env.get("VELOCITY_DB_READ_HOST", env.get("VELOCITY_DB_HOST", "127.0.0.1"))
        read_port = int(env.get("VELOCITY_DB_READ_PORT", env.get("VELOCITY_DB_PORT", "5432")))
        return await asyncpg.connect(
            host=read_host,
            port=read_port,
            database=env["VELOCITY_DB_READ_NAME"],
            user=env["VELOCITY_DB_READ_USER"],
            password=env["VELOCITY_DB_READ_PASSWORD"],
        )

    primary_url = env.get("DATABASE_URL", "")
    if primary_url and not primary_url.startswith("PLACEHOLDER"):
        return await asyncpg.connect(primary_url)

    primary_host = env.get("VELOCITY_DB_HOST", "127.0.0.1")
    primary_port = int(env.get("VELOCITY_DB_PORT", "5432"))
    return await asyncpg.connect(
        host=primary_host,
        port=primary_port,
        database=env["VELOCITY_DB_NAME"],
        user=env["VELOCITY_DB_USER"],
        password=env["VELOCITY_DB_PASSWORD"],
    )
|
|
|
|
|
|
@dataclass
class NaturalQueryResult:
    """Outcome of one natural-language query run, ready to be serialized."""

    # The user's question as passed into the pipeline.
    prompt: str
    # The SQL text that was actually executed.
    sql: str
    title: str
    summary: str
    # Column names taken from the first result row (empty when no rows).
    columns: list[str]
    rows: list[dict[str, Any]]
    row_count: int
    source_tables: list[str]
    component_type: str
    warnings: list[str]
    visualization_decision: VisualizationDecision | None = None
    # Number of auto-replan passes performed before this result was produced.
    replan_count: int = 0
    semantic_catalog_version: str = CATALOG_VERSION

    def as_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict with camelCase keys.

        ``visualizationDecision`` is an empty dict when no decision is attached.
        """
        viz = self.visualization_decision
        if viz:
            viz_payload: dict[str, Any] = {
                "xAxis": viz.x_axis,
                "yAxis": viz.y_axis,
                "dimensionCols": viz.dimension_cols,
                "measureCols": viz.measure_cols,
                "widthMode": viz.width_mode,
                "minHeightPx": viz.min_height_px,
                "skeletonVariant": viz.skeleton_variant,
                "vizParams": viz.viz_params,
                "dataBindings": viz.data_bindings,
                "confidence": viz.confidence,
                "reasoning": viz.reasoning,
            }
        else:
            viz_payload = {}

        payload: dict[str, Any] = {
            "prompt": self.prompt,
            "sql": self.sql,
            "title": self.title,
            "summary": self.summary,
            "columns": self.columns,
            "rows": self.rows,
            "rowCount": self.row_count,
            "sourceTables": self.source_tables,
            "componentType": self.component_type,
            "warnings": self.warnings,
            "semanticCatalogVersion": self.semantic_catalog_version,
            "replanCount": self.replan_count,
            "visualizationDecision": viz_payload,
        }
        return payload
|
|
|
|
|
|
def sanitize_sql(sql: str, row_limit: int) -> tuple[str, list[str], list[str]]:
|
|
warnings: list[str] = []
|
|
clean = re.sub(r"--.*?$|/\*.*?\*/", "", sql.strip(), flags=re.MULTILINE | re.DOTALL).strip().rstrip(";")
|
|
if not re.match(r"^(select|with)\b", clean, re.IGNORECASE):
|
|
raise ValueError("Oracle SQL agent only accepts SELECT or WITH queries.")
|
|
if DESTRUCTIVE_SQL.search(clean):
|
|
raise ValueError("Oracle SQL agent blocked non-read SQL.")
|
|
|
|
tables: list[str] = []
|
|
for match in TABLE_REF_RE.finditer(clean):
|
|
table = match.group(1).split(".")[-1].strip('"').lower()
|
|
if table in {"lateral", "select"}:
|
|
continue
|
|
if table and table not in tables:
|
|
tables.append(table)
|
|
|
|
if "limit" not in clean.lower():
|
|
clean += f" LIMIT {row_limit}"
|
|
warnings.append(f"Row cap {row_limit} auto-applied (query had no LIMIT).")
|
|
return clean, tables, warnings
|
|
|
|
|
|
def _detect_intents(prompt: str) -> list[str]:
|
|
lowered = prompt.lower()
|
|
intents: list[str] = []
|
|
|
|
if any(token in lowered for token in (
|
|
"last contact", "last contacted", "recently contacted", "last call",
|
|
"last message", "last whatsapp", "contacted us", "follow-up", "follow up",
|
|
"days since", "no contact",
|
|
)):
|
|
intents.append("last_contacted")
|
|
|
|
if any(token in lowered for token in (
|
|
"interested in", "shown interest", "interest in", "interested clients",
|
|
"project interest", "property interest",
|
|
)):
|
|
intents.append("interested_clients")
|
|
|
|
if any(token in lowered for token in ("qd score", "qualification score", "desire score", "intent score", "qd")):
|
|
intents.append("qd_score")
|
|
|
|
if any(token in lowered for token in ("pipeline", "stage", "funnel", "kanban", "deal")):
|
|
intents.append("pipeline")
|
|
|
|
if any(token in lowered for token in ("site visit", "visited", "visit")):
|
|
intents.append("site_visits")
|
|
|
|
if any(token in lowered for token in ("call", "transcript", "whatsapp", "email", "message", "conversation", "interaction", "timeline", "activity")):
|
|
intents.append("timeline")
|
|
|
|
if any(token in lowered for token in ("objection", "concern", "complaint", "pushback")):
|
|
intents.append("objections")
|
|
|
|
if any(token in lowered for token in ("broker", "agent performance", "referral")):
|
|
intents.append("broker_performance")
|
|
|
|
if any(token in lowered for token in ("next action", "next step", "what should i do", "follow-up priority", "action queue")):
|
|
intents.append("next_action")
|
|
|
|
if any(token in lowered for token in ("project", "unit", "inventory", "available", "price", "configuration")):
|
|
intents.append("inventory")
|
|
|
|
if any(token in lowered for token in ("client 360", "dossier", "profile")):
|
|
intents.append("client_360")
|
|
|
|
if any(token in lowered for token in ("fact", "memory", "promise", "commitment", "budget", "preference")):
|
|
intents.append("extracted_facts")
|
|
|
|
return intents or ["last_contacted"]
|
|
|
|
|
|
def title_from_prompt(prompt: str) -> str:
    """Derive a display title from a prompt.

    Whitespace runs are collapsed to single spaces, leading/trailing spaces and
    ``?``, ``.``, ``!`` are trimmed, the first character is upper-cased and the
    result is cut to 80 characters.  An empty outcome yields
    ``"Oracle Query Result"``.
    """
    collapsed = re.sub(r"\s+", " ", prompt.strip())
    trimmed = collapsed.strip(" ?.!")
    if not trimmed:
        return "Oracle Query Result"
    return trimmed[0].upper() + trimmed[1:80]
|
|
|
|
|
|
class NaturalDbAgent:
    """Runs a natural-language question through schema introspection, SQL
    planning, verification, execution, quality profiling and visualization
    planning."""

    async def schema_catalog(self, conn: Any | None = None) -> dict[str, Any]:
        """Introspect the ``public`` schema.

        Args:
            conn: an open connection to reuse; when ``None`` a connection is
                opened for this call and closed before returning.

        Returns:
            ``{"tables": [], "available": False}`` when no connection was
            supplied and the database is not configured.  Otherwise a dict with
            ``available`` (True), ``tables`` (table name -> ``{"columns": [...],
            "rowCount": int | None}``) and ``allowedTables`` (base-table names).
        """
        own_conn = conn is None
        if conn is None:
            if not db_ready():
                return {"tables": [], "available": False}
            conn = await connect_db()
        try:
            table_names = await conn.fetch(
                """
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema = 'public' AND table_type = 'BASE TABLE'
                ORDER BY table_name
                """
            )
            public_tables = [row["table_name"] for row in table_names]
            rows = await conn.fetch(
                """
                SELECT c.table_name, c.column_name, c.data_type, c.udt_name, c.is_nullable
                FROM information_schema.columns c
                WHERE c.table_schema = 'public'
                ORDER BY c.table_name, c.ordinal_position
                """
            )
            counts: dict[str, int | None] = {}
            for table in public_tables:
                # Quote the identifier (doubling embedded quotes) so that
                # mixed-case or oddly named tables resolve correctly and the
                # name cannot break out of the interpolated COUNT statement.
                qualified = 'public."' + table.replace('"', '""') + '"'
                exists = await conn.fetchval("SELECT to_regclass($1)", qualified)
                counts[table] = None if not exists else int(await conn.fetchval(f"SELECT COUNT(*) FROM {qualified}"))

            tables: dict[str, dict[str, Any]] = {}
            for row in rows:
                entry = tables.setdefault(row["table_name"], {"columns": [], "rowCount": counts.get(row["table_name"])})
                entry["columns"].append(
                    {
                        "name": row["column_name"],
                        "dataType": row["data_type"],
                        "udtName": row["udt_name"],
                        "nullable": row["is_nullable"] == "YES",
                    }
                )
            return {"available": True, "tables": tables, "allowedTables": public_tables}
        finally:
            if own_conn:
                await conn.close()

    async def data_health(self, conn: Any | None = None) -> dict[str, Any]:
        """Compare actual row counts with the expected synthetic-dataset counts.

        Returns a dict with ``counts``, ``expectedSyntheticV2Counts``,
        ``missingTables`` (catalog entries without a row count), ``emptyTables``
        and ``belowExpected``.  When the database is unavailable every expected
        table is reported under ``belowExpected`` with ``actual`` set to None.
        """
        catalog = await self.schema_catalog(conn)
        expected = {
            "crm_people": 341,
            "crm_leads": 250,
            "crm_opportunities": 400,
            "crm_property_interests": 400,
            "intel_interactions": 1897,
            "intel_messages": 6944,
            "intel_calls": 478,
            "intel_transcripts": 231,
            "intel_emails": 149,
            "intel_visits": 305,
            "intel_reminders": 759,
            "intel_extracted_facts": 1686,
            "read_last_contacted": 250,
            "read_next_best_action": 250,
        }
        # schema_catalog reports "tables" as an empty list when the database
        # is unavailable; normalise to a dict so .items() cannot fail.
        tables = catalog.get("tables") or {}
        counts = {table: (meta or {}).get("rowCount") for table, meta in sorted(tables.items())}
        return {
            "counts": counts,
            "expectedSyntheticV2Counts": expected,
            "missingTables": [table for table, count in counts.items() if count is None],
            "emptyTables": [table for table, count in counts.items() if count == 0],
            "belowExpected": {
                table: {"expected": expected_count, "actual": counts.get(table)}
                for table, expected_count in expected.items()
                if (counts.get(table) or 0) < expected_count
            },
        }

    async def execute_prompt(self, prompt: str, *, row_limit: int = 100, conn: Any | None = None) -> NaturalQueryResult:
        """Answer *prompt* with a SQL-backed result.

        Args:
            prompt: the user's question; must contain non-whitespace text.
            row_limit: row cap passed to planning, verification and sanitizing.
            conn: an open connection to reuse; when ``None`` one is opened and
                closed by this call.

        Raises:
            ValueError: if *prompt* is blank.
            RuntimeError: if no connection was supplied and the database is
                not configured, or if a later pipeline stage fails.
        """
        if not prompt.strip():
            raise ValueError("Prompt is required.")

        own_conn = conn is None
        if conn is None:
            if not db_ready():
                raise RuntimeError("Database unavailable for Oracle natural query.")
            conn = await connect_db()
        try:
            catalog = await self.schema_catalog(conn)
            detected_intents = _detect_intents(prompt)
            return await self._pipeline(
                conn=conn,
                prompt=prompt,
                catalog=catalog,
                detected_intents=detected_intents,
                row_limit=row_limit,
                attempt=0,
                prior_feedback=None,
            )
        finally:
            if own_conn:
                await conn.close()

    async def _pipeline(
        self,
        *,
        conn: Any,
        prompt: str,
        catalog: dict[str, Any],
        detected_intents: list[str],
        row_limit: int,
        attempt: int,
        prior_feedback: str | None,
    ) -> NaturalQueryResult:
        """Plan, verify, execute and profile one attempt; re-plan on failure.

        When the execution profile fails and fewer than
        ``_MAX_REPLAN_ATTEMPTS`` re-plans have been made, the method recurses
        with the profiler's hints as feedback.  Warnings gathered during
        earlier attempts are kept and placed ahead of the final attempt's
        warnings.

        Raises:
            RuntimeError: if the planner yields no SQL or execution fails.
            ValueError: if the verified SQL is rejected by ``sanitize_sql``.
        """
        warnings: list[str] = []

        plan = await self._plan_sql(
            prompt=prompt,
            catalog=catalog,
            detected_intents=detected_intents,
            row_limit=row_limit,
            prior_feedback=prior_feedback,
        )
        raw_sql = str(plan.get("sql") or "").strip()
        if not raw_sql:
            raise RuntimeError("Natural SQL planner returned no SQL.")

        verification = await plan_verifier.verify_and_repair(
            sql=raw_sql,
            prompt=prompt,
            detected_intents=detected_intents,
            row_limit=row_limit,
            llm_service=runtime_llm_service,
        )
        if verification.was_repaired:
            warnings.append(
                "Plan verifier repaired violations: "
                + ", ".join(violation.rule for violation in verification.violations if violation.severity == "blocking")
            )
        if not verification.passed and verification.repair_failed:
            warnings.append("Plan verifier found violations but repair failed. Proceeding with original SQL.")
        if verification.notes:
            warnings.extend(verification.notes)

        effective_sql, source_tables, sanitize_warnings = sanitize_sql(verification.sql, row_limit)
        warnings.extend(sanitize_warnings)

        try:
            records = await conn.fetch(effective_sql)
        except Exception as exc:
            raise RuntimeError(f"Natural SQL execution failed: {exc}") from exc

        rows = [_json_safe(dict(record)) for record in records]
        columns = list(rows[0].keys()) if rows else []

        profile = execution_profiler.profile(
            rows=rows,
            columns=columns,
            sql=effective_sql,
            prompt=prompt,
            source_tables=source_tables,
            row_limit=row_limit,
        )

        if not profile.passed and attempt < _MAX_REPLAN_ATTEMPTS:
            feedback = " | ".join(profile.replan_hints)
            warnings.append(f"Auto-replan triggered (attempt {attempt + 1}): {feedback[:160]}")
            result = await self._pipeline(
                conn=conn,
                prompt=prompt,
                catalog=catalog,
                detected_intents=detected_intents,
                row_limit=row_limit,
                attempt=attempt + 1,
                prior_feedback=feedback,
            )
            # Without this the warnings collected above (including the
            # auto-replan notice itself) would be dropped with this frame.
            result.warnings = [*warnings, *result.warnings]
            return result

        if not profile.passed:
            for issue in profile.issues:
                if issue.severity == "blocking":
                    warnings.append(f"Quality issue after {attempt} replans: [{issue.code}] {issue.description}")

        visualization_decision = visualization_planner.plan(
            rows=rows,
            columns=columns,
            prompt=prompt,
            source_tables=source_tables,
            profile_suggested_type=profile.suggested_component_type,
            title_from_planner=str(plan.get("title") or ""),
        )

        title = visualization_decision.title or str(plan.get("title") or title_from_prompt(prompt))
        summary = str(plan.get("rationale") or f"SQL-backed Oracle result from {', '.join(source_tables) or 'Velocity CRM'}.")

        return NaturalQueryResult(
            prompt=prompt,
            sql=effective_sql,
            title=title,
            summary=summary,
            columns=columns,
            rows=rows,
            row_count=len(rows),
            source_tables=source_tables,
            component_type=visualization_decision.component_type,
            warnings=warnings,
            visualization_decision=visualization_decision,
            replan_count=attempt,
            semantic_catalog_version=CATALOG_VERSION,
        )

    async def _plan_sql(
        self,
        *,
        prompt: str,
        catalog: dict[str, Any],
        detected_intents: list[str],
        row_limit: int,
        prior_feedback: str | None = None,
    ) -> dict[str, Any]:
        """Ask the runtime LLM for a read-only query answering *prompt*.

        The model receives the semantic catalog context, a JSON dump of the
        intent-relevant part of the schema (cut to 14000 characters), the
        detected intents, the row cap and, on a re-plan, the feedback from the
        previous attempt.

        Returns:
            The parsed JSON object from the model; it always has a truthy
            ``sql`` entry.

        Raises:
            RuntimeError: if no LLM provider is configured, or the response is
                not a JSON object containing ``sql``.
        """
        try:
            providers = runtime_llm_service._provider_catalog()
        except Exception:
            # Still treated as "no providers", but leave a trace of the real
            # cause instead of hiding it behind the generic error below.
            logger.warning("Runtime LLM provider catalog lookup failed.", exc_info=True)
            providers = {}
        if not providers:
            raise RuntimeError("No runtime LLM providers configured for Oracle natural planning.")

        schema_full = catalog.get("tables", {})
        relevant_tables = self._relevant_tables_for_intents(detected_intents)
        schema_brief_dict = {
            table: meta
            for table, meta in schema_full.items()
            if table in relevant_tables or table in {"crm_people", "crm_leads", "inventory_projects", "inventory_units"}
        }
        schema_brief = json.dumps(schema_brief_dict, default=str)[:14000]
        semantic_context = build_semantic_context_for_planner(detected_intents, max_concepts=5)

        replan_section = ""
        if prior_feedback:
            replan_section = (
                f"\n\nPREVIOUS ATTEMPT FAILED - EXECUTION FEEDBACK:\n{prior_feedback}\n"
                "You must address the feedback and change the query accordingly."
            )

        response = await runtime_llm_service.chat(
            provider_id="sglang",
            model=None,
            system_prompt=(
                "You are Oracle's read-only PostgreSQL planner for Project Velocity CRM. "
                "Use the semantic catalog as the business source of truth, not raw column guessing. "
                "Generate exactly one SELECT or WITH query. "
                "Return strict JSON with keys: sql, title, rationale. "
                "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements."
            ),
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"SEMANTIC CATALOG:\n{semantic_context}\n\n"
                        f"RAW SCHEMA:\n{schema_brief}\n\n"
                        f"DETECTED INTENTS: {', '.join(detected_intents)}\n\n"
                        f"USER QUESTION:\n{prompt}\n\n"
                        f"ROW CAP: {row_limit}\n"
                        f"{replan_section}\n\n"
                        "Return strict JSON: {\"sql\": \"...\", \"title\": \"...\", \"rationale\": \"...\"}"
                    ),
                }
            ],
            temperature=0.05,
            response_format="json",
            metadata={
                "agent": "oracle_natural_db_agent_v2",
                "intents": detected_intents,
                "catalog_version": CATALOG_VERSION,
            },
        )
        message = response.get("message") or {}
        parsed = message.get("parsedJson")
        content = message.get("content") or "{}"
        if not isinstance(parsed, dict):
            try:
                parsed = json.loads(content) if isinstance(content, str) else content
            except json.JSONDecodeError as exc:
                # Malformed model output is a planner failure, not a caller
                # error; surface it the same way as a missing "sql" key.
                raise RuntimeError("Natural DB planner returned no valid SQL.") from exc
        if isinstance(parsed, dict) and parsed.get("sql"):
            return parsed
        raise RuntimeError("Natural DB planner returned no valid SQL.")

    @staticmethod
    def _relevant_tables_for_intents(intents: list[str]) -> set[str]:
        """Return the union of table names mapped to the given intents.

        Unknown intents contribute nothing; an empty list yields an empty set.
        """
        intent_tables: dict[str, set[str]] = {
            "last_contacted": {
                "intel_interactions",
                "crm_people",
                "crm_leads",
                "read_last_contacted",
                "crm_last_contact_read_model",
            },
            "interested_clients": {
                "crm_property_interests",
                "crm_people",
                "inventory_projects",
                "intel_qd_scores",
            },
            "qd_score": {"intel_qd_scores", "crm_people"},
            "pipeline": {"crm_opportunities", "crm_leads", "crm_people", "inventory_projects"},
            "site_visits": {"intel_visits", "crm_people", "inventory_projects"},
            "timeline": {
                "intel_interactions",
                "intel_calls",
                "intel_whatsapp_threads",
                "intel_messages",
                "intel_emails",
                "intel_visits",
                "crm_people",
            },
            "objections": {"intel_call_objections", "crm_people", "inventory_projects"},
            "broker_performance": {"crm_leads", "crm_opportunities", "crm_people"},
            "next_action": {"read_next_best_action", "crm_people"},
            "inventory": {"inventory_projects", "inventory_units", "crm_property_interests"},
            "client_360": {
                "crm_people",
                "crm_leads",
                "intel_qd_scores",
                "crm_property_interests",
                "crm_opportunities",
                "intel_interactions",
                "read_last_contacted",
                "read_next_best_action",
            },
            "extracted_facts": {"intel_extracted_facts", "crm_people"},
        }
        tables: set[str] = set()
        for intent in intents:
            tables.update(intent_tables.get(intent, set()))
        return tables


# Shared module-level instance.
natural_db_agent = NaturalDbAgent()
|