Files
Project_Velocity/backend/oracle/natural_db_agent.py

469 lines
22 KiB
Python

"""
Natural DB-first Oracle agent.
The LLM can plan arbitrary analytical SELECT statements over the Velocity CRM,
intel, inventory, and read-model tables. The executor enforces a read-only SQL
contract and a UI row cap; write paths stay behind typed API endpoints.
"""
from __future__ import annotations

import json
import logging
import os
import re
from dataclasses import dataclass
from datetime import date, datetime, time, timedelta
from decimal import Decimal
from typing import Any
from uuid import UUID

from backend.services.runtime_llm_service import runtime_llm_service
try:
import asyncpg # type: ignore
except Exception: # pragma: no cover
asyncpg = None # type: ignore
logger = logging.getLogger(__name__)

# Upper bound applied to every requested row limit before SQL is executed.
MAX_ROW_CAP = 500

# Tables the agent is allowed to read; anything else found after FROM/JOIN is
# rejected by the SQL sanitizer.
ALLOWED_TABLES = {
    "crm_people", "crm_leads", "crm_accounts", "crm_households", "crm_relationships",
    "crm_opportunities", "crm_property_interests", "crm_stage_history",
    "intel_interactions", "intel_messages", "intel_calls", "intel_transcripts",
    "intel_emails", "intel_email_threads", "intel_whatsapp_threads", "intel_visits",
    "intel_reminders", "intel_qd_scores", "intel_qd_timeseries",
    "intel_extracted_facts", "intel_call_objections", "intel_cctv_links",
    "intel_perception_events", "intel_vehicle_events",
    "inventory_projects", "inventory_units",
    "read_last_contacted", "read_next_best_action",
}

# Keywords that must never appear in a statement the agent executes.
# ``into`` is included because ``SELECT ... INTO new_table`` creates a table
# even though the statement starts with SELECT.
DESTRUCTIVE_SQL = re.compile(
    r"\b(insert|update|delete|drop|alter|truncate|copy|create|grant|revoke|call|execute|do|merge|into)\b",
    re.IGNORECASE,
)

# Captures the identifier that follows FROM/JOIN when that identifier is
# followed by whitespace or the end of the input.
# NOTE(review): identifiers followed directly by ")" or ",", and quoted
# identifiers, are not captured by this pattern.
TABLE_REF_RE = re.compile(r"\b(?:from|join)\s+([a-zA-Z_][\w.]*)(?:\s|$)", re.IGNORECASE)
def _json_safe(value: Any) -> Any:
if isinstance(value, (datetime, date)):
return value.isoformat()
if isinstance(value, Decimal):
return float(value)
if isinstance(value, (list, tuple)):
return [_json_safe(v) for v in value]
if isinstance(value, dict):
return {str(k): _json_safe(v) for k, v in value.items()}
return value
def db_ready() -> bool:
    """Report whether enough configuration exists to attempt a DB connection.

    Returns ``False`` when ``asyncpg`` could not be imported. Otherwise returns
    ``True`` when ``DATABASE_URL`` is set to something that does not start with
    ``PLACEHOLDER``, or when all of ``VELOCITY_DB_NAME``, ``VELOCITY_DB_USER``
    and ``VELOCITY_DB_PASSWORD`` are set to non-empty values.
    """
    if asyncpg is None:
        return False
    dsn = os.getenv("DATABASE_URL", "")
    if dsn and not dsn.startswith("PLACEHOLDER"):
        return True
    for variable in ("VELOCITY_DB_NAME", "VELOCITY_DB_USER", "VELOCITY_DB_PASSWORD"):
        if not os.getenv(variable):
            return False
    return True
async def connect_db() -> Any:
    """Open a new ``asyncpg`` connection from environment configuration.

    A ``DATABASE_URL`` that is set and does not start with ``PLACEHOLDER`` is
    used as the DSN. Otherwise the connection is built from the
    ``VELOCITY_DB_*`` variables, with host defaulting to ``127.0.0.1`` and port
    to ``5432``.

    Raises:
        RuntimeError: if ``asyncpg`` is not installed.
        KeyError: if the DSN is unusable and ``VELOCITY_DB_NAME``,
            ``VELOCITY_DB_USER`` or ``VELOCITY_DB_PASSWORD`` is missing.
    """
    if asyncpg is None:
        raise RuntimeError("asyncpg is not installed.")
    dsn = os.getenv("DATABASE_URL", "")
    dsn_usable = bool(dsn) and not dsn.startswith("PLACEHOLDER")
    if dsn_usable:
        return await asyncpg.connect(dsn)
    settings = {
        "host": os.getenv("VELOCITY_DB_HOST", "127.0.0.1"),
        "port": int(os.getenv("VELOCITY_DB_PORT", "5432")),
        "database": os.environ["VELOCITY_DB_NAME"],
        "user": os.environ["VELOCITY_DB_USER"],
        "password": os.environ["VELOCITY_DB_PASSWORD"],
    }
    return await asyncpg.connect(**settings)
@dataclass
class NaturalQueryResult:
    """Outcome of one natural-language query, ready to be serialised for the UI."""

    prompt: str  # the user's original question
    sql: str  # the SQL text that produced ``rows``
    title: str
    summary: str
    columns: list[str]
    rows: list[dict[str, Any]]
    row_count: int
    source_tables: list[str]
    component_type: str
    warnings: list[str]

    def as_dict(self) -> dict[str, Any]:
        """Return the result as a dict, using camelCase keys for multi-word fields."""
        key_to_attribute = (
            ("prompt", "prompt"),
            ("sql", "sql"),
            ("title", "title"),
            ("summary", "summary"),
            ("columns", "columns"),
            ("rows", "rows"),
            ("rowCount", "row_count"),
            ("sourceTables", "source_tables"),
            ("componentType", "component_type"),
            ("warnings", "warnings"),
        )
        payload: dict[str, Any] = {}
        for key, attribute in key_to_attribute:
            payload[key] = getattr(self, attribute)
        return payload
# A single-quoted string or double-quoted identifier whose only escape is a
# doubled quote character.
_QUOTED_SQL_RE = re.compile(r"'(?:[^']|'')*'|\"(?:[^\"]|\"\")*\"")
_SQL_IDENT = r'(?:"[^"]*"|[a-zA-Z_]\w*)'
# FROM/JOIN target, also when it is directly followed by ")", "," or ";".
_TABLE_TARGET = rf"\b(?:from|join)\s+({_SQL_IDENT}(?:\.{_SQL_IDENT})*)(?=[\s,;)]|$)"
_STRICT_TABLE_RE = re.compile(_TABLE_TARGET, re.IGNORECASE)
# Same as the strict pattern, but first consumes the FROM keyword of
# ``EXTRACT/TRIM/SUBSTRING/OVERLAY(... FROM expr)`` and ``IS [NOT] DISTINCT FROM``
# so that their operand is not mistaken for a table (group 1 stays None).
# Only used on quote-masked text, where those prefixes cannot be forged
# inside a literal or quoted identifier.
_MASKED_TABLE_RE = re.compile(
    r"\b(?:extract|trim|substring|overlay)\s*\([^()]*?\bfrom\b"
    r"|\bis\s+(?:not\s+)?distinct\s+from\b"
    r"|" + _TABLE_TARGET,
    re.IGNORECASE,
)
_CTE_HEAD_RE = re.compile(
    r"\s*([a-zA-Z_]\w*)(?:\s*\([^()]*\))?\s*\bas\b\s*(?:(?:not\s+)?materialized\b\s*)?\(",
    re.IGNORECASE,
)
_CTE_SEPARATOR_RE = re.compile(r"\s*,")
_TOP_LEVEL_LIMIT_RE = re.compile(r"\blimit\s+(\d+)(?:\s+offset\s+\d+)?\s*$", re.IGNORECASE)


def _mask_quoted(sql: str) -> str | None:
    """Blank out the content of quoted strings/identifiers, keeping offsets.

    Returns ``None`` when the text cannot be lexed reliably with the simple
    doubled-quote rule: backslashes (escape strings), ``$`` (dollar quoting),
    leftover comment markers, or an unterminated quote. Callers then fall back
    to scanning the raw text, which is stricter.
    """
    if any(marker in sql for marker in ("\\", "$", "--", "/*")):
        return None
    remainder = _QUOTED_SQL_RE.sub("", sql)
    if "'" in remainder or '"' in remainder:
        return None
    return _QUOTED_SQL_RE.sub(
        lambda m: m.group(0)[0] + "_" * (len(m.group(0)) - 2) + m.group(0)[0],
        sql,
    )


def _top_level_ctes(masked: str) -> tuple[bool, dict[str, int]]:
    """Locate the CTEs of a leading WITH clause in quote-masked SQL.

    Returns ``(recursive, ends)`` where ``ends`` maps each lower-cased CTE name
    to the offset just past the closing parenthesis of its body. Any parse
    difficulty yields ``(False, {})`` so that no name is exempted.
    """
    head = re.match(r"with\b(\s+recursive\b)?", masked, re.IGNORECASE)
    if head is None:
        return False, {}
    ends: dict[str, int] = {}
    position = head.end()
    while True:
        declaration = _CTE_HEAD_RE.match(masked, position)
        if declaration is None:
            return False, {}
        depth, index = 1, declaration.end()
        while depth and index < len(masked):
            char = masked[index]
            depth += (char == "(") - (char == ")")
            index += 1
        if depth:
            return False, {}
        ends[declaration.group(1).lower()] = index
        separator = _CTE_SEPARATOR_RE.match(masked, index)
        if separator is None:
            break
        position = separator.end()
    return head.group(1) is not None, ends


def sanitize_sql(sql: str, row_limit: int) -> tuple[str, list[str], list[str]]:
    """Validate a planner-produced statement and enforce the UI row cap.

    Steps:
      1. Strip SQL comments and trailing semicolons.
      2. Require the statement to start with SELECT or WITH.
      3. Reject statements containing a ``DESTRUCTIVE_SQL`` keyword. When the
         text can be lexed safely, keywords inside string literals and quoted
         identifiers are ignored (so ``channel = 'call'`` is accepted).
      4. Check every FROM/JOIN target against ``ALLOWED_TABLES``. Names declared
         by the statement's leading WITH clause are exempt, but only for
         references located after the CTE body (anywhere for WITH RECURSIVE),
         because inside its own non-recursive body the name still resolves to
         a real table.
      5. Wrap the statement in an outer ``LIMIT`` unless it already ends with a
         top-level ``LIMIT n`` that is within the cap.

    Args:
        sql: Raw SQL text.
        row_limit: Requested row limit; falsy values mean 100. The effective
            cap is clamped to ``1..MAX_ROW_CAP``.

    Returns:
        ``(sql_to_execute, source_tables, warnings)``. CTE names are not
        reported as source tables.

    Raises:
        ValueError: if the statement is not a single SELECT/WITH, contains a
            destructive keyword, or references a table outside the allow-list.

    NOTE(review): this is a textual heuristic, not a SQL parser. Additional
    tables in comma-separated FROM lists, set-returning functions in FROM and
    schema-qualified look-alikes are not policed here; the database role used
    by the agent should itself be read-only and least-privileged.
    """
    warnings: list[str] = []
    clean = re.sub(r"--.*?$|/\*.*?\*/", "", sql.strip(), flags=re.MULTILINE | re.DOTALL)
    clean = clean.strip().rstrip(";").rstrip()
    if not re.match(r"^(select|with)\b", clean, re.IGNORECASE):
        raise ValueError("Oracle SQL agent only accepts SELECT or WITH queries.")

    # Scan the masked text when lexing is safe, otherwise the raw text
    # (stricter: literal content is then treated like code).
    masked = _mask_quoted(clean)
    scan = clean if masked is None else masked
    if DESTRUCTIVE_SQL.search(scan):
        raise ValueError("Oracle SQL agent blocked non-read SQL.")
    if masked is not None and ";" in masked:
        raise ValueError("Oracle SQL agent only accepts a single statement.")

    recursive, cte_ends = (False, {}) if masked is None else _top_level_ctes(masked)
    table_re = _STRICT_TABLE_RE if masked is None else _MASKED_TABLE_RE

    tables: list[str] = []
    for match in table_re.finditer(scan):
        if match.group(1) is None:
            continue  # FROM belonged to EXTRACT(...)/TRIM(...)/IS DISTINCT FROM
        # Offsets are identical in the masked and raw text, so read the real name.
        raw = clean[match.start(1):match.end(1)]
        last = raw.split(".")[-1]
        quoted = last.startswith('"')
        # Quoted identifiers are case-sensitive; unquoted ones fold to lower case.
        table = last[1:-1] if quoted else last.lower()
        if not quoted and table in {"lateral", "select"}:
            continue
        cte_name = raw.lower()  # quoted or qualified names never match a CTE key
        if cte_name in cte_ends and (recursive or match.start() >= cte_ends[cte_name]):
            continue
        if table and table not in tables:
            tables.append(table)

    blocked = [table for table in tables if table not in ALLOWED_TABLES]
    if blocked:
        raise ValueError(f"Oracle SQL agent blocked unknown tables: {', '.join(blocked)}")

    capped = max(1, min(int(row_limit or 100), MAX_ROW_CAP))
    # Only a LIMIT that ends the statement bounds the final result; a LIMIT in
    # a subquery or one larger than the cap must not disable the wrapper.
    top_limit = _TOP_LEVEL_LIMIT_RE.search(scan)
    if top_limit is None or int(top_limit.group(1)) > capped:
        clean = f"SELECT * FROM ({clean}) oracle_limited_rows LIMIT {capped}"
        warnings.append(f"Applied UI row cap LIMIT {capped}.")
    return clean, tables, warnings
def infer_component_type(prompt: str, columns: list[str], rows: list[dict[str, Any]]) -> str:
    """Pick a UI component name for a result set from the prompt and data shape.

    Rules are evaluated in order and the first match wins:
      1. Prompt mentions a conversation/timeline term -> ``"activity_stream"``.
      2. Exactly one row, at most five columns, at least one numeric value in
         that row -> ``"kpi_tile"``.
      3. A timestamp-like column exists: more than one row plus a trend term
         -> ``"line_chart"``; otherwise an activity term -> ``"activity_stream"``.
      4. The first row has a numeric value and the prompt mentions a
         counting/ranking term -> ``"bar_chart"``.
      5. Otherwise ``"table"``.
    """
    text = prompt.lower()

    def mentions(*terms: str) -> bool:
        return any(term in text for term in terms)

    def is_numeric(column: str) -> bool:
        # Only the first row is inspected.
        return isinstance(rows[0].get(column), (int, float))

    if mentions("timeline", "conversation", "whatsapp", "message", "call", "email", "history"):
        return "activity_stream"

    if len(rows) == 1 and len(columns) <= 5 and any(is_numeric(column) for column in columns):
        return "kpi_tile"

    temporal_names = {"date", "when", "timestamp", "happened_at"}
    has_temporal_column = False
    for column in columns:
        if column.endswith("_at") or column in temporal_names:
            has_temporal_column = True
            break
    if has_temporal_column:
        if len(rows) > 1 and mentions("trend", "over time", "timeseries"):
            return "line_chart"
        if mentions("timeline", "activity", "last", "recent"):
            return "activity_stream"

    if rows and any(is_numeric(column) for column in columns):
        if mentions("count", "compare", "distribution", "most", "top", "by "):
            return "bar_chart"
    return "table"
def title_from_prompt(prompt: str) -> str:
    """Derive a display title from a prompt.

    Whitespace runs are collapsed to single spaces, surrounding spaces and
    ``?``, ``.``, ``!`` are trimmed, the first character is upper-cased and the
    result is limited to the first 80 characters of the trimmed text. An empty
    result falls back to ``"Oracle Query Result"``.
    """
    collapsed = re.sub(r"\s+", " ", prompt.strip())
    trimmed = collapsed.strip(" ?.!")
    if not trimmed:
        return "Oracle Query Result"
    first_char, remainder = trimmed[:1], trimmed[1:80]
    return first_char.upper() + remainder
class NaturalDbAgent:
    """Plans, guards and executes natural-language read queries.

    A prompt is turned into SQL by the runtime LLM service (or by deterministic
    fallback builders), passed through ``sanitize_sql`` and executed on an
    asyncpg connection. The result is packaged as a ``NaturalQueryResult``.
    """

    @staticmethod
    def _clamp_row_limit(row_limit: int | None) -> int:
        """Clamp a requested limit to ``1..MAX_ROW_CAP``; falsy values mean 100.

        Uses the same rule as ``sanitize_sql`` so the SQL builders and the
        sanitizer always agree on the effective cap.
        """
        return max(1, min(int(row_limit or 100), MAX_ROW_CAP))

    async def schema_catalog(self, conn: Any | None = None) -> dict[str, Any]:
        """Describe the allow-listed tables: columns plus current row counts.

        Args:
            conn: Existing connection to use. When omitted a connection is
                opened for the call and closed afterwards.

        Returns:
            ``{"available": True, "tables": {name: {"columns": [...],
            "rowCount": int | None}}, "allowedTables": [...]}``, or
            ``{"tables": [], "available": False}`` when no connection was
            supplied and the database is not configured.
        """
        own_conn = conn is None
        if conn is None:
            if not db_ready():
                return {"tables": [], "available": False}
            conn = await connect_db()
        try:
            rows = await conn.fetch(
                """
                SELECT c.table_name, c.column_name, c.data_type, c.udt_name, c.is_nullable
                FROM information_schema.columns c
                WHERE c.table_schema = 'public' AND c.table_name = ANY($1::text[])
                ORDER BY c.table_name, c.ordinal_position
                """,
                sorted(ALLOWED_TABLES),
            )
            counts = {}
            # One existence probe and one COUNT(*) per allow-listed table. The
            # table name is interpolated, but it only ever comes from the
            # ALLOWED_TABLES constant, never from user input.
            for table in sorted(ALLOWED_TABLES):
                exists = await conn.fetchval("SELECT to_regclass($1)", f"public.{table}")
                counts[table] = None if not exists else int(await conn.fetchval(f"SELECT COUNT(*) FROM {table}"))
            tables: dict[str, dict[str, Any]] = {}
            for row in rows:
                entry = tables.setdefault(row["table_name"], {"columns": [], "rowCount": counts.get(row["table_name"])})
                entry["columns"].append({
                    "name": row["column_name"],
                    "dataType": row["data_type"],
                    "udtName": row["udt_name"],
                    "nullable": row["is_nullable"] == "YES",
                })
            return {"available": True, "tables": tables, "allowedTables": sorted(ALLOWED_TABLES)}
        finally:
            if own_conn:
                await conn.close()

    async def data_health(self, conn: Any | None = None) -> dict[str, Any]:
        """Compare live row counts with the expected synthetic-dataset counts.

        Returns a dict with ``counts`` (per allow-listed table, ``None`` when
        the table is absent from the catalog), ``expectedSyntheticV2Counts``,
        ``missingTables``, ``emptyTables`` and ``belowExpected``.
        """
        catalog = await self.schema_catalog(conn)
        expected = {
            "crm_people": 341,
            "crm_leads": 250,
            "crm_opportunities": 400,
            "crm_property_interests": 400,
            "intel_interactions": 1897,
            "intel_messages": 6944,
            "intel_calls": 478,
            "intel_transcripts": 231,
            "intel_emails": 149,
            "intel_visits": 305,
            "intel_reminders": 759,
            "intel_extracted_facts": 1686,
            "read_last_contacted": 250,
            "read_next_best_action": 250,
        }
        # schema_catalog reports ``"tables": []`` when the database is not
        # available; normalise that to an empty mapping so lookups do not fail.
        tables = catalog.get("tables") or {}
        counts = {table: (tables.get(table) or {}).get("rowCount") for table in sorted(ALLOWED_TABLES)}
        return {
            "counts": counts,
            "expectedSyntheticV2Counts": expected,
            "missingTables": [t for t, count in counts.items() if count is None],
            "emptyTables": [t for t, count in counts.items() if count == 0],
            "belowExpected": {t: {"expected": e, "actual": counts.get(t)} for t, e in expected.items() if (counts.get(t) or 0) < e},
        }

    async def execute_prompt(self, prompt: str, *, row_limit: int = 100, conn: Any | None = None) -> NaturalQueryResult:
        """Answer a natural-language prompt with a SQL-backed result.

        Args:
            prompt: The user's question; must contain non-whitespace text.
            row_limit: Requested maximum number of rows.
            conn: Existing connection to use. When omitted a connection is
                opened for the call and closed afterwards.

        Raises:
            ValueError: if the prompt is blank, or the SQL is rejected by
                ``sanitize_sql``.
            RuntimeError: if no connection was supplied and the database is
                not configured.
        """
        if not prompt.strip():
            raise ValueError("Prompt is required.")
        own_conn = conn is None
        if conn is None:
            if not db_ready():
                raise RuntimeError("Database unavailable for Oracle natural query.")
            conn = await connect_db()
        try:
            catalog = await self.schema_catalog(conn)
            plan = await self._plan_sql(prompt, catalog, row_limit)
            return await self._run_plan(conn, prompt, plan, row_limit)
        finally:
            if own_conn:
                await conn.close()

    async def _run_plan(self, conn: Any, prompt: str, plan: dict[str, Any], row_limit: int) -> NaturalQueryResult:
        """Sanitize and execute a plan, with one repair and one zero-row retry.

        If the first execution raises, a replacement statement from
        ``_repair_sql`` is sanitized and run once. If the final result is
        empty, a broader query from ``_zero_row_retry_sql`` is tried and used
        only when it returns rows. Every substitution is recorded in the
        result's warnings.
        """
        raw_sql = str(plan.get("sql") or "").strip()
        if not raw_sql:
            raw_sql = self._fallback_sql(prompt, row_limit)
        sql, tables, warnings = sanitize_sql(raw_sql, row_limit)
        try:
            records = await conn.fetch(sql)
        except Exception as exc:
            retry = await self._repair_sql(prompt, raw_sql, str(exc), row_limit)
            sql, tables, retry_warnings = sanitize_sql(retry, row_limit)
            warnings.extend(retry_warnings)
            warnings.append(f"Initial SQL repaired after database error: {exc}")
            records = await conn.fetch(sql)
        if not records:
            retry_sql = self._zero_row_retry_sql(prompt, row_limit, raw_sql)
            if retry_sql and retry_sql.strip() != raw_sql.strip():
                retry_clean, retry_tables, retry_warnings = sanitize_sql(retry_sql, row_limit)
                retry_records = await conn.fetch(retry_clean)
                if retry_records:
                    sql = retry_clean
                    tables = retry_tables
                    records = retry_records
                    warnings.extend(retry_warnings)
                    warnings.append("Initial SQL returned zero rows; Oracle retried with a broader CRM read query.")
        rows = [_json_safe(dict(record)) for record in records]
        # Column names are taken from the first row, so an empty result has none.
        columns = list(rows[0].keys()) if rows else []
        component_type = infer_component_type(prompt, columns, rows)
        return NaturalQueryResult(
            prompt=prompt,
            sql=sql,
            title=str(plan.get("title") or title_from_prompt(prompt)),
            summary=str(plan.get("rationale") or f"SQL-backed Oracle result from {', '.join(tables) or 'Velocity CRM'}."),
            columns=columns,
            rows=rows,
            row_count=len(rows),
            source_tables=tables,
            component_type=component_type,
            warnings=warnings,
        )

    async def _plan_sql(self, prompt: str, catalog: dict[str, Any], row_limit: int) -> dict[str, Any]:
        """Ask the LLM for a ``{"sql", "title", "rationale"}`` plan.

        Falls back to the deterministic plan when no provider is configured,
        when the call fails, or when the response carries no ``sql``.
        """
        fallback = {"sql": self._fallback_sql(prompt, row_limit), "title": title_from_prompt(prompt), "rationale": "Deterministic SQL planner fallback."}
        try:
            providers = runtime_llm_service._provider_catalog()
        except Exception:
            providers = {}
        if not providers:
            return fallback
        # The schema JSON is cut at 16000 characters to bound the prompt size.
        schema_brief = json.dumps(catalog.get("tables", {}), default=str)[:16000]
        system = (
            "You are Oracle's read-only PostgreSQL planner. Generate one useful SELECT or WITH query "
            "for the user's CRM question. Use only the provided schema. Return JSON with sql, title, rationale. "
            "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements."
        )
        try:
            response = await runtime_llm_service.chat(
                provider_id="sglang",
                model=None,
                system_prompt=system,
                messages=[{"role": "user", "content": f"Schema:\n{schema_brief}\n\nQuestion:\n{prompt}\n\nRow cap: {row_limit}"}],
                temperature=0.05,
                response_format="json",
                metadata={"agent": "oracle_natural_db_agent"},
            )
            message = response.get("message") or {}
            parsed = message.get("parsedJson")
            content = message.get("content") or "{}"
            if not isinstance(parsed, dict):
                parsed = json.loads(content) if isinstance(content, str) else content
            if isinstance(parsed, dict) and parsed.get("sql"):
                return parsed
        except Exception as exc:
            # Deliberate best-effort: any planner failure degrades to the fallback.
            logger.warning("Natural DB planner LLM failed, using fallback: %s", exc)
        return fallback

    async def _repair_sql(self, prompt: str, failed_sql: str, error: str, row_limit: int) -> str:
        """Choose a replacement statement after a database error.

        Kept deterministic (no model call). When the failed statement used one
        of the ``read_*`` read-model tables and the error says a relation does
        not exist, the last-contacted query built from base tables is returned;
        otherwise the prompt-keyed fallback is returned.
        """
        relation_missing = "does not exist" in error.lower()
        uses_read_model = "read_last_contacted" in failed_sql or "read_next_best_action" in failed_sql
        if relation_missing and uses_read_model:
            return self._base_last_contacted_sql(row_limit)
        return self._fallback_sql(prompt, row_limit)

    def _zero_row_retry_sql(self, prompt: str, row_limit: int, previous_sql: str) -> str | None:
        """Pick a broader query to try after an empty result, or ``None``.

        Contact-style prompts map to the last-contacted query, property-style
        prompts to the property-interest query, and anything else to the
        generic client list unless the previous SQL already read from
        ``crm_people``.
        """
        lower = prompt.lower()
        if any(term in lower for term in ("contact", "recent", "last", "call", "message", "email", "whatsapp", "follow")):
            return self._base_last_contacted_sql(row_limit)
        if any(term in lower for term in ("interest", "interested", "property", "project", "unit", "budget", "bhk")):
            return self._base_property_interest_sql(row_limit)
        if "from crm_people" not in previous_sql.lower():
            return self._generic_clients_sql(row_limit)
        return None

    def _base_last_contacted_sql(self, row_limit: int) -> str:
        """Build the "latest contact per person" query from base intel tables.

        Interactions, messages, emails and visits are unioned into one event
        stream, ranked per person by event time, and the newest event is joined
        to ``crm_people`` and the person's top ``intel_qd_scores`` value.

        Derived tables are used instead of CTEs so that every FROM/JOIN target
        in the statement is a real allow-listed table name.
        """
        limit = self._clamp_row_limit(row_limit)
        return f"""
            SELECT p.person_id::text, p.full_name AS name, p.primary_phone AS phone,
                   p.primary_email AS email, r.event_at AS last_contacted_at,
                   r.channel AS last_contact_channel, r.event_type AS last_interaction_type,
                   r.summary AS last_contact_summary, r.actor AS last_contact_actor,
                   r.interaction_count::int,
                   q.current_value AS qd_score
            FROM (
                SELECT ce.*,
                       row_number() OVER (PARTITION BY ce.person_id ORDER BY ce.event_at DESC) AS rn,
                       count(*) OVER (PARTITION BY ce.person_id) AS interaction_count
                FROM (
                    SELECT i.person_id, i.happened_at AS event_at, i.channel::text AS channel,
                           i.interaction_type AS event_type, i.summary AS summary, i.broker_name AS actor
                    FROM intel_interactions i
                    WHERE i.happened_at IS NOT NULL
                    UNION ALL
                    SELECT i.person_id, m.delivered_at, 'message', COALESCE(m.sender_role, 'message'), m.message_text, m.sender_name
                    FROM intel_messages m
                    JOIN intel_interactions i ON i.interaction_id = m.interaction_id
                    WHERE m.delivered_at IS NOT NULL
                    UNION ALL
                    SELECT i.person_id, e.sent_at, 'email', COALESCE(e.direction::text, 'email'), e.subject, e.from_address
                    FROM intel_emails e
                    JOIN intel_interactions i ON i.interaction_id = e.interaction_id
                    WHERE e.sent_at IS NOT NULL
                    UNION ALL
                    SELECT v.person_id, v.visited_at, 'site_visit', 'visit', v.outcome, v.hosted_by
                    FROM intel_visits v
                    WHERE v.visited_at IS NOT NULL
                ) ce
            ) r
            JOIN crm_people p ON p.person_id = r.person_id
            LEFT JOIN LATERAL (
                SELECT current_value FROM intel_qd_scores q
                WHERE q.person_id = p.person_id
                ORDER BY q.current_value DESC, q.computed_at DESC
                LIMIT 1
            ) q ON TRUE
            WHERE r.rn = 1
            ORDER BY r.event_at DESC
            LIMIT {limit}
            """

    def _base_property_interest_sql(self, row_limit: int) -> str:
        """Build the per-person property-interest summary query."""
        limit = self._clamp_row_limit(row_limit)
        return f"""
            SELECT p.person_id::text, p.full_name AS name, p.primary_phone AS phone, p.primary_email AS email,
                   COUNT(pi.interest_id)::int AS interest_count,
                   string_agg(DISTINCT COALESCE(pi.project_name, pr.project_name), ', ') AS projects,
                   string_agg(DISTINCT pi.configuration, ', ') AS configurations,
                   MIN(pi.budget_min) AS budget_min, MAX(pi.budget_max) AS budget_max,
                   MAX(pi.last_discussed_at) AS last_interest_at,
                   MAX(q.current_value) AS qd_score
            FROM crm_people p
            JOIN crm_property_interests pi ON pi.person_id = p.person_id
            LEFT JOIN inventory_projects pr ON pr.project_id = pi.project_id
            LEFT JOIN intel_qd_scores q ON q.person_id = p.person_id
            GROUP BY p.person_id, p.full_name, p.primary_phone, p.primary_email
            HAVING COUNT(pi.interest_id) > 0
            ORDER BY interest_count DESC, qd_score DESC NULLS LAST, last_interest_at DESC NULLS LAST
            LIMIT {limit}
            """

    def _generic_clients_sql(self, row_limit: int) -> str:
        """Build the generic client list with latest lead and top QD score."""
        limit = self._clamp_row_limit(row_limit)
        return f"""
            SELECT p.person_id::text, p.full_name AS name, p.primary_email AS email, p.primary_phone AS phone,
                   p.buyer_type, l.status::text AS lead_status, l.budget_band, l.urgency,
                   q.current_value AS qd_score
            FROM crm_people p
            LEFT JOIN LATERAL (
                SELECT * FROM crm_leads l WHERE l.person_id = p.person_id ORDER BY l.updated_at DESC LIMIT 1
            ) l ON TRUE
            LEFT JOIN LATERAL (
                SELECT current_value FROM intel_qd_scores q
                WHERE q.person_id = p.person_id
                ORDER BY q.current_value DESC, q.computed_at DESC
                LIMIT 1
            ) q ON TRUE
            ORDER BY qd_score DESC NULLS LAST, p.full_name ASC
            LIMIT {limit}
            """

    def _fallback_sql(self, prompt: str, row_limit: int) -> str:
        """Choose a deterministic query from keywords in the prompt.

        Branches, in order: objections, messages/WhatsApp, recent contact,
        property interest, and finally the generic client list.
        """
        lower = prompt.lower()
        limit = self._clamp_row_limit(row_limit)
        if "objection" in lower:
            return f"""
                SELECT p.person_id::text, p.full_name AS name, co.objection_type, co.category, co.severity,
                       co.client_quote, co.agent_response, co.extracted_at
                FROM intel_call_objections co
                JOIN intel_calls c ON c.call_id = co.call_id
                JOIN intel_interactions i ON i.interaction_id = c.interaction_id
                JOIN crm_people p ON p.person_id = i.person_id
                ORDER BY co.extracted_at DESC
                LIMIT {limit}
                """
        if "whatsapp" in lower or "message" in lower or "conversation" in lower:
            # The text filter uses the prompt's first space-delimited word.
            # Never embed raw prompt text in SQL: keep word characters only so
            # the value cannot terminate the literal, and drop the filter when
            # the word itself would trip the destructive-keyword guard.
            keyword = re.sub(r"\W+", "", prompt.split(" ", 1)[0].lower())
            conditions = []
            if not DESTRUCTIVE_SQL.search(keyword):
                conditions.append(f"lower(m.message_text) LIKE '%{keyword}%'")
            conditions.append("i.channel = 'whatsapp'")
            where_clause = "\n                   OR ".join(conditions)
            return f"""
                SELECT p.person_id::text, p.full_name AS name, 'whatsapp' AS type,
                       m.message_text AS summary, m.sender_role AS actor, m.delivered_at AS date
                FROM intel_messages m
                JOIN intel_interactions i ON i.interaction_id = m.interaction_id
                JOIN crm_people p ON p.person_id = i.person_id
                WHERE {where_clause}
                ORDER BY m.delivered_at DESC
                LIMIT {limit}
                """
        if "contact" in lower or "recent" in lower or "last" in lower:
            return f"""
                SELECT p.person_id::text, p.full_name AS name, p.primary_phone AS phone,
                       lc.last_contact_at AS last_contacted_at, lc.last_channel AS last_contact_channel,
                       lc.last_interaction_type, lc.days_since_contact, lc.total_interactions AS interaction_count,
                       nba.recommended_action AS next_action, q.current_value AS qd_score
                FROM crm_people p
                LEFT JOIN read_last_contacted lc ON lc.person_id = p.person_id
                LEFT JOIN read_next_best_action nba ON nba.person_id = p.person_id
                LEFT JOIN LATERAL (
                    SELECT current_value FROM intel_qd_scores q
                    WHERE q.person_id = p.person_id
                    ORDER BY q.current_value DESC, q.computed_at DESC
                    LIMIT 1
                ) q ON TRUE
                WHERE lc.last_contact_at IS NOT NULL
                ORDER BY lc.last_contact_at DESC
                LIMIT {limit}
                """
        if "4 bhk" in lower or "budget" in lower or "interest" in lower or "property" in lower or "client" in lower:
            return self._base_property_interest_sql(limit)
        return self._generic_clients_sql(limit)
# Module-level agent instance created at import time.
natural_db_agent = NaturalDbAgent()