""" Natural DB-first Oracle agent. The LLM can plan arbitrary analytical SELECT statements over the Velocity CRM, intel, inventory, and read-model tables. The executor enforces a read-only SQL contract and a UI row cap; write paths stay behind typed API endpoints. """ from __future__ import annotations import json import logging import os import re from dataclasses import dataclass from datetime import date, datetime from decimal import Decimal from typing import Any from backend.services.runtime_llm_service import runtime_llm_service try: import asyncpg # type: ignore except Exception: # pragma: no cover asyncpg = None # type: ignore logger = logging.getLogger(__name__) MAX_ROW_CAP = 500 ALLOWED_TABLES = { "crm_people", "crm_leads", "crm_accounts", "crm_households", "crm_relationships", "crm_opportunities", "crm_property_interests", "crm_stage_history", "intel_interactions", "intel_messages", "intel_calls", "intel_transcripts", "intel_emails", "intel_email_threads", "intel_whatsapp_threads", "intel_visits", "intel_reminders", "intel_qd_scores", "intel_qd_timeseries", "intel_extracted_facts", "intel_call_objections", "intel_cctv_links", "intel_perception_events", "intel_vehicle_events", "inventory_projects", "inventory_units", "read_last_contacted", "read_next_best_action", } DESTRUCTIVE_SQL = re.compile( r"\b(insert|update|delete|drop|alter|truncate|copy|create|grant|revoke|call|execute|do|merge)\b", re.IGNORECASE, ) TABLE_REF_RE = re.compile(r"\b(?:from|join)\s+([a-zA-Z_][\w.]*)(?:\s|$)", re.IGNORECASE) def _json_safe(value: Any) -> Any: if isinstance(value, (datetime, date)): return value.isoformat() if isinstance(value, Decimal): return float(value) if isinstance(value, (list, tuple)): return [_json_safe(v) for v in value] if isinstance(value, dict): return {str(k): _json_safe(v) for k, v in value.items()} return value def db_ready() -> bool: if asyncpg is None: return False database_url = os.getenv("DATABASE_URL", "") return bool(database_url and not database_url.startswith("PLACEHOLDER")) or all( os.getenv(name) for name in ("VELOCITY_DB_NAME", "VELOCITY_DB_USER", "VELOCITY_DB_PASSWORD") ) async def connect_db() -> Any: if asyncpg is None: raise RuntimeError("asyncpg is not installed.") database_url = os.getenv("DATABASE_URL", "") if database_url and not database_url.startswith("PLACEHOLDER"): return await asyncpg.connect(database_url) return await asyncpg.connect( host=os.getenv("VELOCITY_DB_HOST", "127.0.0.1"), port=int(os.getenv("VELOCITY_DB_PORT", "5432")), database=os.environ["VELOCITY_DB_NAME"], user=os.environ["VELOCITY_DB_USER"], password=os.environ["VELOCITY_DB_PASSWORD"], ) @dataclass class NaturalQueryResult: prompt: str sql: str title: str summary: str columns: list[str] rows: list[dict[str, Any]] row_count: int source_tables: list[str] component_type: str warnings: list[str] def as_dict(self) -> dict[str, Any]: return { "prompt": self.prompt, "sql": self.sql, "title": self.title, "summary": self.summary, "columns": self.columns, "rows": self.rows, "rowCount": self.row_count, "sourceTables": self.source_tables, "componentType": self.component_type, "warnings": self.warnings, } def sanitize_sql(sql: str, row_limit: int) -> tuple[str, list[str], list[str]]: warnings: list[str] = [] clean = re.sub(r"--.*?$|/\*.*?\*/", "", sql.strip(), flags=re.MULTILINE | re.DOTALL).strip().rstrip(";") if not re.match(r"^(select|with)\b", clean, re.IGNORECASE): raise ValueError("Oracle SQL agent only accepts SELECT or WITH queries.") if DESTRUCTIVE_SQL.search(clean): raise ValueError("Oracle SQL agent blocked non-read SQL.") tables = [] for match in TABLE_REF_RE.finditer(clean): table = match.group(1).split(".")[-1].strip('"').lower() if table in {"lateral", "select"}: continue if table and table not in tables: tables.append(table) blocked = [table for table in tables if table not in ALLOWED_TABLES] if blocked: raise ValueError(f"Oracle SQL agent blocked unknown tables: {', '.join(blocked)}") capped = max(1, min(int(row_limit or 100), MAX_ROW_CAP)) if not re.search(r"\blimit\s+\d+\b", clean, re.IGNORECASE): clean = f"SELECT * FROM ({clean}) oracle_limited_rows LIMIT {capped}" warnings.append(f"Applied UI row cap LIMIT {capped}.") return clean, tables, warnings def infer_component_type(prompt: str, columns: list[str], rows: list[dict[str, Any]]) -> str: lower = prompt.lower() if any(term in lower for term in ("timeline", "conversation", "whatsapp", "message", "call", "email", "history")): return "activity_stream" if len(rows) == 1 and len(columns) <= 5 and any(isinstance(rows[0].get(c), (int, float)) for c in columns): return "kpi_tile" if any(c.endswith("_at") or c in {"date", "when", "timestamp", "happened_at"} for c in columns): if len(rows) > 1 and any(term in lower for term in ("trend", "over time", "timeseries")): return "line_chart" if any(term in lower for term in ("timeline", "activity", "last", "recent")): return "activity_stream" numeric_cols = [c for c in columns if rows and isinstance(rows[0].get(c), (int, float))] if numeric_cols and any(term in lower for term in ("count", "compare", "distribution", "most", "top", "by ")): return "bar_chart" return "table" def title_from_prompt(prompt: str) -> str: words = re.sub(r"\s+", " ", prompt.strip()).strip(" ?.!") return words[:1].upper() + words[1:80] if words else "Oracle Query Result" class NaturalDbAgent: async def schema_catalog(self, conn: Any | None = None) -> dict[str, Any]: own_conn = conn is None if conn is None: if not db_ready(): return {"tables": [], "available": False} conn = await connect_db() try: rows = await conn.fetch( """ SELECT c.table_name, c.column_name, c.data_type, c.udt_name, c.is_nullable FROM information_schema.columns c WHERE c.table_schema = 'public' AND c.table_name = ANY($1::text[]) ORDER BY c.table_name, c.ordinal_position """, sorted(ALLOWED_TABLES), ) counts = {} for table in sorted(ALLOWED_TABLES): exists = await conn.fetchval("SELECT to_regclass($1)", f"public.{table}") counts[table] = None if not exists else int(await conn.fetchval(f"SELECT COUNT(*) FROM {table}")) tables: dict[str, dict[str, Any]] = {} for row in rows: entry = tables.setdefault(row["table_name"], {"columns": [], "rowCount": counts.get(row["table_name"])}) entry["columns"].append({ "name": row["column_name"], "dataType": row["data_type"], "udtName": row["udt_name"], "nullable": row["is_nullable"] == "YES", }) return {"available": True, "tables": tables, "allowedTables": sorted(ALLOWED_TABLES)} finally: if own_conn: await conn.close() async def data_health(self, conn: Any | None = None) -> dict[str, Any]: catalog = await self.schema_catalog(conn) expected = { "crm_people": 341, "crm_leads": 250, "crm_opportunities": 400, "crm_property_interests": 400, "intel_interactions": 1897, "intel_messages": 6944, "intel_calls": 478, "intel_transcripts": 231, "intel_emails": 149, "intel_visits": 305, "intel_reminders": 759, "intel_extracted_facts": 1686, "read_last_contacted": 250, "read_next_best_action": 250, } tables = catalog.get("tables", {}) counts = {table: (tables.get(table) or {}).get("rowCount") for table in sorted(ALLOWED_TABLES)} return { "counts": counts, "expectedSyntheticV2Counts": expected, "missingTables": [t for t, count in counts.items() if count is None], "emptyTables": [t for t, count in counts.items() if count == 0], "belowExpected": {t: {"expected": e, "actual": counts.get(t)} for t, e in expected.items() if (counts.get(t) or 0) < e}, } async def execute_prompt(self, prompt: str, *, row_limit: int = 100, conn: Any | None = None) -> NaturalQueryResult: if not prompt.strip(): raise ValueError("Prompt is required.") own_conn = conn is None if conn is None: if not db_ready(): raise RuntimeError("Database unavailable for Oracle natural query.") conn = await connect_db() try: catalog = await self.schema_catalog(conn) plan = await self._plan_sql(prompt, catalog, row_limit) return await self._run_plan(conn, prompt, plan, row_limit) finally: if own_conn: await conn.close() async def _run_plan(self, conn: Any, prompt: str, plan: dict[str, Any], row_limit: int) -> NaturalQueryResult: raw_sql = str(plan.get("sql") or "").strip() if not raw_sql: raw_sql = self._fallback_sql(prompt, row_limit) sql, tables, warnings = sanitize_sql(raw_sql, row_limit) try: records = await conn.fetch(sql) except Exception as exc: retry = await self._repair_sql(prompt, raw_sql, str(exc), row_limit) sql, tables, retry_warnings = sanitize_sql(retry, row_limit) warnings.extend(retry_warnings) warnings.append(f"Initial SQL repaired after database error: {exc}") records = await conn.fetch(sql) if not records: retry_sql = self._zero_row_retry_sql(prompt, row_limit, raw_sql) if retry_sql and retry_sql.strip() != raw_sql.strip(): retry_clean, retry_tables, retry_warnings = sanitize_sql(retry_sql, row_limit) retry_records = await conn.fetch(retry_clean) if retry_records: sql = retry_clean tables = retry_tables records = retry_records warnings.extend(retry_warnings) warnings.append("Initial SQL returned zero rows; Oracle retried with a broader CRM read query.") rows = [_json_safe(dict(record)) for record in records] columns = list(rows[0].keys()) if rows else [] component_type = infer_component_type(prompt, columns, rows) return NaturalQueryResult( prompt=prompt, sql=sql, title=str(plan.get("title") or title_from_prompt(prompt)), summary=str(plan.get("rationale") or f"SQL-backed Oracle result from {', '.join(tables) or 'Velocity CRM'}."), columns=columns, rows=rows, row_count=len(rows), source_tables=tables, component_type=component_type, warnings=warnings, ) async def _plan_sql(self, prompt: str, catalog: dict[str, Any], row_limit: int) -> dict[str, Any]: fallback = {"sql": self._fallback_sql(prompt, row_limit), "title": title_from_prompt(prompt), "rationale": "Deterministic SQL planner fallback."} try: providers = runtime_llm_service._provider_catalog() except Exception: providers = {} if not providers: return fallback schema_brief = json.dumps(catalog.get("tables", {}), default=str)[:16000] system = ( "You are Oracle's read-only PostgreSQL planner. Generate one useful SELECT or WITH query " "for the user's CRM question. Use only the provided schema. Return JSON with sql, title, rationale. " "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements." ) try: response = await runtime_llm_service.chat( provider_id="sglang", model=None, system_prompt=system, messages=[{"role": "user", "content": f"Schema:\n{schema_brief}\n\nQuestion:\n{prompt}\n\nRow cap: {row_limit}"}], temperature=0.05, response_format="json", metadata={"agent": "oracle_natural_db_agent"}, ) message = response.get("message") or {} parsed = message.get("parsedJson") content = message.get("content") or "{}" if not isinstance(parsed, dict): parsed = json.loads(content) if isinstance(content, str) else content if isinstance(parsed, dict) and parsed.get("sql"): return parsed except Exception as exc: logger.warning("Natural DB planner LLM failed, using fallback: %s", exc) return fallback async def _repair_sql(self, prompt: str, failed_sql: str, error: str, row_limit: int) -> str: # Keep retry operationally deterministic if model is unavailable. if "read_last_contacted" in failed_sql and "does not exist" in error.lower(): return self._base_last_contacted_sql(row_limit) if "read_next_best_action" in failed_sql and "does not exist" in error.lower(): return self._base_last_contacted_sql(row_limit) return self._fallback_sql(prompt, row_limit) def _zero_row_retry_sql(self, prompt: str, row_limit: int, previous_sql: str) -> str | None: lower = prompt.lower() if any(term in lower for term in ("contact", "recent", "last", "call", "message", "email", "whatsapp", "follow")): return self._base_last_contacted_sql(row_limit) if any(term in lower for term in ("interest", "interested", "property", "project", "unit", "budget", "bhk")): return self._base_property_interest_sql(row_limit) if "from crm_people" not in previous_sql.lower(): return self._generic_clients_sql(row_limit) return None def _base_last_contacted_sql(self, row_limit: int) -> str: limit = max(1, min(row_limit, MAX_ROW_CAP)) return f""" WITH contact_events AS ( SELECT i.person_id, i.happened_at AS event_at, i.channel::text AS channel, i.interaction_type AS event_type, i.summary AS summary, i.broker_name AS actor FROM intel_interactions i WHERE i.happened_at IS NOT NULL UNION ALL SELECT i.person_id, m.delivered_at, 'message', COALESCE(m.sender_role, 'message'), m.message_text, m.sender_name FROM intel_messages m JOIN intel_interactions i ON i.interaction_id = m.interaction_id WHERE m.delivered_at IS NOT NULL UNION ALL SELECT i.person_id, e.sent_at, 'email', COALESCE(e.direction::text, 'email'), e.subject, e.from_address FROM intel_emails e JOIN intel_interactions i ON i.interaction_id = e.interaction_id WHERE e.sent_at IS NOT NULL UNION ALL SELECT v.person_id, v.visited_at, 'site_visit', 'visit', v.outcome, v.hosted_by FROM intel_visits v WHERE v.visited_at IS NOT NULL ), ranked AS ( SELECT *, row_number() OVER (PARTITION BY person_id ORDER BY event_at DESC) AS rn, count(*) OVER (PARTITION BY person_id) AS interaction_count FROM contact_events ) SELECT p.person_id::text, p.full_name AS name, p.primary_phone AS phone, p.primary_email AS email, r.event_at AS last_contacted_at, r.channel AS last_contact_channel, r.event_type AS last_interaction_type, r.summary AS last_contact_summary, r.actor AS last_contact_actor, r.interaction_count::int, q.current_value AS qd_score FROM ranked r JOIN crm_people p ON p.person_id = r.person_id LEFT JOIN LATERAL ( SELECT current_value FROM intel_qd_scores q WHERE q.person_id = p.person_id ORDER BY q.current_value DESC, q.computed_at DESC LIMIT 1 ) q ON TRUE WHERE r.rn = 1 ORDER BY r.event_at DESC LIMIT {limit} """ def _base_property_interest_sql(self, row_limit: int) -> str: limit = max(1, min(row_limit, MAX_ROW_CAP)) return f""" SELECT p.person_id::text, p.full_name AS name, p.primary_phone AS phone, p.primary_email AS email, COUNT(pi.interest_id)::int AS interest_count, string_agg(DISTINCT COALESCE(pi.project_name, pr.project_name), ', ') AS projects, string_agg(DISTINCT pi.configuration, ', ') AS configurations, MIN(pi.budget_min) AS budget_min, MAX(pi.budget_max) AS budget_max, MAX(pi.last_discussed_at) AS last_interest_at, MAX(q.current_value) AS qd_score FROM crm_people p JOIN crm_property_interests pi ON pi.person_id = p.person_id LEFT JOIN inventory_projects pr ON pr.project_id = pi.project_id LEFT JOIN intel_qd_scores q ON q.person_id = p.person_id GROUP BY p.person_id, p.full_name, p.primary_phone, p.primary_email HAVING COUNT(pi.interest_id) > 0 ORDER BY interest_count DESC, qd_score DESC NULLS LAST, last_interest_at DESC NULLS LAST LIMIT {limit} """ def _generic_clients_sql(self, row_limit: int) -> str: limit = max(1, min(row_limit, MAX_ROW_CAP)) return f""" SELECT p.person_id::text, p.full_name AS name, p.primary_email AS email, p.primary_phone AS phone, p.buyer_type, l.status::text AS lead_status, l.budget_band, l.urgency, q.current_value AS qd_score FROM crm_people p LEFT JOIN LATERAL ( SELECT * FROM crm_leads l WHERE l.person_id = p.person_id ORDER BY l.updated_at DESC LIMIT 1 ) l ON TRUE LEFT JOIN LATERAL ( SELECT current_value FROM intel_qd_scores q WHERE q.person_id = p.person_id ORDER BY q.current_value DESC, q.computed_at DESC LIMIT 1 ) q ON TRUE ORDER BY qd_score DESC NULLS LAST, p.full_name ASC LIMIT {limit} """ def _fallback_sql(self, prompt: str, row_limit: int) -> str: lower = prompt.lower() limit = max(1, min(row_limit, MAX_ROW_CAP)) if "objection" in lower: return f""" SELECT p.person_id::text, p.full_name AS name, co.objection_type, co.category, co.severity, co.client_quote, co.agent_response, co.extracted_at FROM intel_call_objections co JOIN intel_calls c ON c.call_id = co.call_id JOIN intel_interactions i ON i.interaction_id = c.interaction_id JOIN crm_people p ON p.person_id = i.person_id ORDER BY co.extracted_at DESC LIMIT {limit} """ if "whatsapp" in lower or "message" in lower or "conversation" in lower: return f""" SELECT p.person_id::text, p.full_name AS name, 'whatsapp' AS type, m.message_text AS summary, m.sender_role AS actor, m.delivered_at AS date FROM intel_messages m JOIN intel_interactions i ON i.interaction_id = m.interaction_id JOIN crm_people p ON p.person_id = i.person_id WHERE lower(m.message_text) LIKE '%' || lower(split_part($${prompt}$$, ' ', 1)) || '%' OR i.channel = 'whatsapp' ORDER BY m.delivered_at DESC LIMIT {limit} """ if "contact" in lower or "recent" in lower or "last" in lower: return f""" SELECT p.person_id::text, p.full_name AS name, p.primary_phone AS phone, lc.last_contact_at AS last_contacted_at, lc.last_channel AS last_contact_channel, lc.last_interaction_type, lc.days_since_contact, lc.total_interactions AS interaction_count, nba.recommended_action AS next_action, q.current_value AS qd_score FROM crm_people p LEFT JOIN read_last_contacted lc ON lc.person_id = p.person_id LEFT JOIN read_next_best_action nba ON nba.person_id = p.person_id LEFT JOIN LATERAL ( SELECT current_value FROM intel_qd_scores q WHERE q.person_id = p.person_id ORDER BY q.current_value DESC, q.computed_at DESC LIMIT 1 ) q ON TRUE WHERE lc.last_contact_at IS NOT NULL ORDER BY lc.last_contact_at DESC LIMIT {limit} """ if "4 bhk" in lower or "budget" in lower or "interest" in lower or "property" in lower or "client" in lower: return self._base_property_interest_sql(limit) return self._generic_clients_sql(limit) natural_db_agent = NaturalDbAgent()