"""
Natural DB-first Oracle agent.

Pipeline:
1. schema introspection
2. semantic SQL planning
3. plan verification and optional repair
4. SQL execution
5. execution quality profiling and auto-replan
6. visualization planning from actual result shape
"""

from __future__ import annotations

import json
import logging
import os
import re
from dataclasses import dataclass
from datetime import date, datetime, time
from decimal import Decimal
from typing import Any
from uuid import UUID

from backend.services.runtime_llm_service import runtime_llm_service

from .execution_profiler import execution_profiler
from .plan_verifier import plan_verifier
from .semantic_catalog import CATALOG_VERSION, build_semantic_context_for_planner
from .visualization_planner import VisualizationDecision, visualization_planner

try:
    import asyncpg  # type: ignore
except Exception:  # pragma: no cover
    asyncpg = None  # type: ignore

logger = logging.getLogger(__name__)

# Keywords that must never appear (as whole words) in planner-generated SQL.
DESTRUCTIVE_SQL = re.compile(
    r"\b(insert|update|delete|drop|alter|truncate|copy|create|grant|revoke|call|execute|do|merge)\b",
    re.IGNORECASE,
)
# Captures the identifier following FROM / JOIN.
TABLE_REF_RE = re.compile(r"\b(?:from|join)\s+([a-zA-Z_][\w.]*)(?:\s|$)", re.IGNORECASE)
# Whole-word LIMIT; "_" is a word character, so "credit_limit" does not match.
_LIMIT_RE = re.compile(r"\blimit\b", re.IGNORECASE)

_MAX_REPLAN_ATTEMPTS = 2

# Discrete-credential environment variables; a group counts as configured only
# when every variable in it is set to a non-empty value.
_READ_DB_ENV_VARS = ("VELOCITY_DB_READ_NAME", "VELOCITY_DB_READ_USER", "VELOCITY_DB_READ_PASSWORD")
_PRIMARY_DB_ENV_VARS = ("VELOCITY_DB_NAME", "VELOCITY_DB_USER", "VELOCITY_DB_PASSWORD")

# Ordered (intent, trigger substrings) table; order defines the order of the
# detected intent list.
_INTENT_TOKENS: tuple[tuple[str, tuple[str, ...]], ...] = (
    (
        "last_contacted",
        (
            "last contact",
            "last contacted",
            "recently contacted",
            "last call",
            "last message",
            "last whatsapp",
            "contacted us",
            "follow-up",
            "follow up",
            "days since",
            "no contact",
        ),
    ),
    (
        "interested_clients",
        (
            "interested in",
            "shown interest",
            "interest in",
            "interested clients",
            "project interest",
            "property interest",
        ),
    ),
    ("qd_score", ("qd score", "qualification score", "desire score", "intent score", "qd")),
    ("pipeline", ("pipeline", "stage", "funnel", "kanban", "deal")),
    ("site_visits", ("site visit", "visited", "visit")),
    (
        "timeline",
        (
            "call",
            "transcript",
            "whatsapp",
            "email",
            "message",
            "conversation",
            "interaction",
            "timeline",
            "activity",
        ),
    ),
    ("objections", ("objection", "concern", "complaint", "pushback")),
    ("broker_performance", ("broker", "agent performance", "referral")),
    (
        "next_action",
        ("next action", "next step", "what should i do", "follow-up priority", "action queue"),
    ),
    ("inventory", ("project", "unit", "inventory", "available", "price", "configuration")),
    ("client_360", ("client 360", "dossier", "profile")),
    ("extracted_facts", ("fact", "memory", "promise", "commitment", "budget", "preference")),
)


def _json_safe(value: Any) -> Any:
    """Recursively convert *value* into JSON-friendly Python types.

    Dates, datetimes and times become ISO-8601 strings, ``Decimal`` becomes
    ``float``, ``UUID`` becomes ``str``; lists/tuples become lists and dict
    keys are stringified. Any other value is returned unchanged.
    """
    if isinstance(value, (datetime, date, time)):
        return value.isoformat()
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, UUID):
        return str(value)
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, dict):
        return {str(key): _json_safe(item) for key, item in value.items()}
    return value


def _configured_url(env_name: str) -> str:
    """Return the URL held in *env_name*, or ``""`` if unset or a PLACEHOLDER value."""
    url = os.getenv(env_name, "")
    if url and not url.startswith("PLACEHOLDER"):
        return url
    return ""


def _all_set(env_names: tuple[str, ...]) -> bool:
    """Return True when every variable in *env_names* has a non-empty value."""
    return all(os.getenv(name) for name in env_names)


def db_ready() -> bool:
    """Report whether a database connection can be attempted.

    Returns False when asyncpg is not importable. Otherwise returns True when
    any configuration source that ``connect_db`` accepts is present: the read
    URL, the read credential triple, the primary URL, or the primary
    credential triple.
    """
    if asyncpg is None:
        return False
    if _configured_url("ORACLE_READ_DATABASE_URL"):
        return True
    # connect_db() honours the read-replica triple, so readiness must as well.
    if _all_set(_READ_DB_ENV_VARS):
        return True
    if _configured_url("DATABASE_URL"):
        return True
    return _all_set(_PRIMARY_DB_ENV_VARS)


async def connect_db() -> Any:
    """Open an asyncpg connection using the first available configuration.

    Precedence: ``ORACLE_READ_DATABASE_URL``, then the ``VELOCITY_DB_READ_*``
    credentials, then ``DATABASE_URL``, then the ``VELOCITY_DB_*`` credentials.

    Raises:
        RuntimeError: asyncpg is not installed.
        KeyError: the final fallback is reached and a ``VELOCITY_DB_*``
            credential variable is missing.
    """
    if asyncpg is None:
        raise RuntimeError("asyncpg is not installed.")

    read_database_url = _configured_url("ORACLE_READ_DATABASE_URL")
    if read_database_url:
        return await asyncpg.connect(read_database_url)

    if _all_set(_READ_DB_ENV_VARS):
        # Host and port fall back to the primary settings when no read-specific
        # value is provided.
        return await asyncpg.connect(
            host=os.getenv("VELOCITY_DB_READ_HOST", os.getenv("VELOCITY_DB_HOST", "127.0.0.1")),
            port=int(os.getenv("VELOCITY_DB_READ_PORT", os.getenv("VELOCITY_DB_PORT", "5432"))),
            database=os.environ["VELOCITY_DB_READ_NAME"],
            user=os.environ["VELOCITY_DB_READ_USER"],
            password=os.environ["VELOCITY_DB_READ_PASSWORD"],
        )

    database_url = _configured_url("DATABASE_URL")
    if database_url:
        return await asyncpg.connect(database_url)

    return await asyncpg.connect(
        host=os.getenv("VELOCITY_DB_HOST", "127.0.0.1"),
        port=int(os.getenv("VELOCITY_DB_PORT", "5432")),
        database=os.environ["VELOCITY_DB_NAME"],
        user=os.environ["VELOCITY_DB_USER"],
        password=os.environ["VELOCITY_DB_PASSWORD"],
    )


@dataclass
class NaturalQueryResult:
    """Outcome of one natural-language query: the SQL run, its rows and display hints."""

    prompt: str
    sql: str
    title: str
    summary: str
    columns: list[str]
    rows: list[dict[str, Any]]
    row_count: int
    source_tables: list[str]
    component_type: str
    warnings: list[str]
    visualization_decision: VisualizationDecision | None = None
    replan_count: int = 0
    semantic_catalog_version: str = CATALOG_VERSION

    def as_dict(self) -> dict[str, Any]:
        """Return a camelCase dict view of the result.

        ``visualizationDecision`` is an empty dict when no decision is attached.
        """
        decision = self.visualization_decision
        return {
            "prompt": self.prompt,
            "sql": self.sql,
            "title": self.title,
            "summary": self.summary,
            "columns": self.columns,
            "rows": self.rows,
            "rowCount": self.row_count,
            "sourceTables": self.source_tables,
            "componentType": self.component_type,
            "warnings": self.warnings,
            "semanticCatalogVersion": self.semantic_catalog_version,
            "replanCount": self.replan_count,
            "visualizationDecision": {
                "xAxis": decision.x_axis,
                "yAxis": decision.y_axis,
                "dimensionCols": decision.dimension_cols,
                "measureCols": decision.measure_cols,
                "widthMode": decision.width_mode,
                "minHeightPx": decision.min_height_px,
                "skeletonVariant": decision.skeleton_variant,
                "vizParams": decision.viz_params,
                "dataBindings": decision.data_bindings,
                "confidence": decision.confidence,
                "reasoning": decision.reasoning,
            }
            if decision
            else {},
        }


def sanitize_sql(sql: str, row_limit: int) -> tuple[str, list[str], list[str]]:
    """Validate planner SQL as read-only and make sure it carries a row cap.

    Comments and a trailing semicolon are stripped; the statement must start
    with SELECT or WITH and must not contain any keyword matched by
    ``DESTRUCTIVE_SQL``. When the statement has no LIMIT keyword,
    ``LIMIT {row_limit}`` is appended and a warning is recorded.

    Args:
        sql: SQL text to check.
        row_limit: Row cap appended when the statement has no LIMIT.

    Returns:
        ``(clean_sql, tables, warnings)`` where ``tables`` are the lower-cased,
        de-duplicated identifiers found after FROM/JOIN (schema prefix removed).

    Raises:
        ValueError: the statement is not a SELECT/WITH or contains a blocked
            keyword.
    """
    warnings: list[str] = []
    clean = (
        re.sub(r"--.*?$|/\*.*?\*/", "", sql.strip(), flags=re.MULTILINE | re.DOTALL)
        .strip()
        .rstrip(";")
    )
    if not re.match(r"^(select|with)\b", clean, re.IGNORECASE):
        raise ValueError("Oracle SQL agent only accepts SELECT or WITH queries.")
    if DESTRUCTIVE_SQL.search(clean):
        raise ValueError("Oracle SQL agent blocked non-read SQL.")

    tables: list[str] = []
    for match in TABLE_REF_RE.finditer(clean):
        table = match.group(1).split(".")[-1].strip('"').lower()
        # "FROM LATERAL ..." / "FROM select ..." are keywords, not table names.
        if table in {"lateral", "select"}:
            continue
        if table and table not in tables:
            tables.append(table)

    # Whole-word match: a plain substring test would treat identifiers such as
    # "credit_limit" as a LIMIT clause and skip the row cap.
    if not _LIMIT_RE.search(clean):
        clean += f" LIMIT {row_limit}"
        warnings.append(f"Row cap {row_limit} auto-applied (query had no LIMIT).")
    return clean, tables, warnings


def _detect_intents(prompt: str) -> list[str]:
    """Return the intents whose trigger substrings occur in *prompt*.

    Matching is case-insensitive substring containment against
    ``_INTENT_TOKENS``. Falls back to ``["last_contacted"]`` when nothing
    matches.
    """
    lowered = prompt.lower()
    intents = [
        intent
        for intent, tokens in _INTENT_TOKENS
        if any(token in lowered for token in tokens)
    ]
    return intents or ["last_contacted"]


def title_from_prompt(prompt: str) -> str:
    """Derive a display title from *prompt*.

    Whitespace runs are collapsed, surrounding spaces and ``?.!`` are removed,
    the first character is upper-cased and the text is cut at 80 characters.
    An empty result yields ``"Oracle Query Result"``.
    """
    words = re.sub(r"\s+", " ", prompt.strip()).strip(" ?.!")
    return (words[:1].upper() + words[1:80]) if words else "Oracle Query Result"


class NaturalDbAgent:
    """Plans, verifies, executes and profiles SQL for natural-language prompts."""

    async def schema_catalog(self, conn: Any | None = None) -> dict[str, Any]:
        """Introspect the ``public`` schema: columns and row counts per table.

        Args:
            conn: Existing connection to reuse. When omitted, a connection is
                opened here and closed before returning.

        Returns:
            ``{"available": True, "tables": {...}, "allowedTables": [...]}``, or
            ``{"tables": [], "available": False}`` when no connection was
            supplied and the database is not ready.
        """
        own_conn = conn is None
        if conn is None:
            if not db_ready():
                return {"tables": [], "available": False}
            conn = await connect_db()
        try:
            table_names = await conn.fetch(
                """
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema = 'public' AND table_type = 'BASE TABLE'
                ORDER BY table_name
                """
            )
            public_tables = [row["table_name"] for row in table_names]
            rows = await conn.fetch(
                """
                SELECT c.table_name, c.column_name, c.data_type, c.udt_name, c.is_nullable
                FROM information_schema.columns c
                WHERE c.table_schema = 'public'
                ORDER BY c.table_name, c.ordinal_position
                """
            )

            counts: dict[str, int | None] = {}
            for table in public_tables:
                # Quote the identifier the same way for the existence check and
                # the count, so mixed-case names are not case-folded by
                # to_regclass, and escape embedded double quotes.
                quoted = '"' + table.replace('"', '""') + '"'
                exists = await conn.fetchval("SELECT to_regclass($1)", f"public.{quoted}")
                if not exists:
                    counts[table] = None
                else:
                    counts[table] = int(await conn.fetchval(f"SELECT COUNT(*) FROM {quoted}"))

            tables: dict[str, dict[str, Any]] = {}
            for row in rows:
                entry = tables.setdefault(
                    row["table_name"],
                    {"columns": [], "rowCount": counts.get(row["table_name"])},
                )
                entry["columns"].append(
                    {
                        "name": row["column_name"],
                        "dataType": row["data_type"],
                        "udtName": row["udt_name"],
                        "nullable": row["is_nullable"] == "YES",
                    }
                )
            return {"available": True, "tables": tables, "allowedTables": public_tables}
        finally:
            if own_conn:
                await conn.close()

    async def data_health(self, conn: Any | None = None) -> dict[str, Any]:
        """Compare catalog row counts with the expected synthetic-dataset counts.

        Returns:
            A dict with ``counts``, ``expectedSyntheticV2Counts``,
            ``missingTables`` (row count is None), ``emptyTables`` (row count
            is 0) and ``belowExpected`` (actual count lower than expected; a
            missing count is treated as 0).
        """
        catalog = await self.schema_catalog(conn)
        expected = {
            "crm_people": 341,
            "crm_leads": 250,
            "crm_opportunities": 400,
            "crm_property_interests": 400,
            "intel_interactions": 1897,
            "intel_messages": 6944,
            "intel_calls": 478,
            "intel_transcripts": 231,
            "intel_emails": 149,
            "intel_visits": 305,
            "intel_reminders": 759,
            "intel_extracted_facts": 1686,
            "read_last_contacted": 250,
            "read_next_best_action": 250,
        }
        # schema_catalog() reports "tables": [] when the database is
        # unavailable; normalise to a dict so .items() cannot fail.
        tables = catalog.get("tables") or {}
        counts = {table: (meta or {}).get("rowCount") for table, meta in sorted(tables.items())}
        return {
            "counts": counts,
            "expectedSyntheticV2Counts": expected,
            "missingTables": [table for table, count in counts.items() if count is None],
            "emptyTables": [table for table, count in counts.items() if count == 0],
            "belowExpected": {
                table: {"expected": expected_count, "actual": counts.get(table)}
                for table, expected_count in expected.items()
                if (counts.get(table) or 0) < expected_count
            },
        }

    async def execute_prompt(
        self,
        prompt: str,
        *,
        row_limit: int = 100,
        conn: Any | None = None,
    ) -> NaturalQueryResult:
        """Answer a natural-language *prompt* with a SQL-backed result.

        Args:
            prompt: The user's question; must contain non-whitespace text.
            row_limit: Row cap passed to the planner and SQL sanitizer.
            conn: Existing connection to reuse. When omitted, a connection is
                opened here and closed before returning.

        Raises:
            ValueError: the prompt is blank.
            RuntimeError: no connection was supplied and the database is not
                ready.
        """
        if not prompt.strip():
            raise ValueError("Prompt is required.")
        own_conn = conn is None
        if conn is None:
            if not db_ready():
                raise RuntimeError("Database unavailable for Oracle natural query.")
            conn = await connect_db()
        try:
            catalog = await self.schema_catalog(conn)
            detected_intents = _detect_intents(prompt)
            return await self._pipeline(
                conn=conn,
                prompt=prompt,
                catalog=catalog,
                detected_intents=detected_intents,
                row_limit=row_limit,
                attempt=0,
                prior_feedback=None,
            )
        finally:
            if own_conn:
                await conn.close()

    async def _pipeline(
        self,
        *,
        conn: Any,
        prompt: str,
        catalog: dict[str, Any],
        detected_intents: list[str],
        row_limit: int,
        attempt: int,
        prior_feedback: str | None,
    ) -> NaturalQueryResult:
        """Run plan -> verify -> sanitize -> execute -> profile -> visualize once.

        When the execution profile fails and fewer than
        ``_MAX_REPLAN_ATTEMPTS`` replans have been made, the pipeline calls
        itself again with the profiler's hints as planner feedback.

        Args:
            attempt: Zero-based count of replans already performed.
            prior_feedback: Profiler hints from the previous attempt, if any.

        Raises:
            RuntimeError: the planner returned no SQL, or executing the SQL
                failed.
            ValueError: the SQL was rejected by ``sanitize_sql``.
        """
        warnings: list[str] = []
        plan = await self._plan_sql(
            prompt=prompt,
            catalog=catalog,
            detected_intents=detected_intents,
            row_limit=row_limit,
            prior_feedback=prior_feedback,
        )
        raw_sql = str(plan.get("sql") or "").strip()
        if not raw_sql:
            raise RuntimeError("Natural SQL planner returned no SQL.")

        verification = await plan_verifier.verify_and_repair(
            sql=raw_sql,
            prompt=prompt,
            detected_intents=detected_intents,
            row_limit=row_limit,
            llm_service=runtime_llm_service,
        )
        if verification.was_repaired:
            warnings.append(
                "Plan verifier repaired violations: "
                + ", ".join(
                    violation.rule
                    for violation in verification.violations
                    if violation.severity == "blocking"
                )
            )
        if not verification.passed and verification.repair_failed:
            warnings.append(
                "Plan verifier found violations but repair failed. Proceeding with original SQL."
            )
        if verification.notes:
            warnings.extend(verification.notes)

        effective_sql, source_tables, sanitize_warnings = sanitize_sql(verification.sql, row_limit)
        warnings.extend(sanitize_warnings)

        try:
            records = await conn.fetch(effective_sql)
        except Exception as exc:
            raise RuntimeError(f"Natural SQL execution failed: {exc}") from exc

        rows = [_json_safe(dict(record)) for record in records]
        # Column names are taken from the first row, so an empty result has none.
        columns = list(rows[0].keys()) if rows else []

        profile = execution_profiler.profile(
            rows=rows,
            columns=columns,
            sql=effective_sql,
            prompt=prompt,
            source_tables=source_tables,
            row_limit=row_limit,
        )

        if not profile.passed and attempt < _MAX_REPLAN_ATTEMPTS:
            feedback = " | ".join(profile.replan_hints)
            replan_notice = f"Auto-replan triggered (attempt {attempt + 1}): {feedback[:160]}"
            result = await self._pipeline(
                conn=conn,
                prompt=prompt,
                catalog=catalog,
                detected_intents=detected_intents,
                row_limit=row_limit,
                attempt=attempt + 1,
                prior_feedback=feedback,
            )
            # The nested call builds its own warning list, so the replan notice
            # must be attached to its result or it is lost. Only the notice is
            # carried over: the other warnings describe SQL that was discarded.
            result.warnings.insert(0, replan_notice)
            return result

        if not profile.passed:
            for issue in profile.issues:
                if issue.severity == "blocking":
                    warnings.append(
                        f"Quality issue after {attempt} replans: [{issue.code}] {issue.description}"
                    )

        visualization_decision = visualization_planner.plan(
            rows=rows,
            columns=columns,
            prompt=prompt,
            source_tables=source_tables,
            profile_suggested_type=profile.suggested_component_type,
            title_from_planner=str(plan.get("title") or ""),
        )

        title = visualization_decision.title or str(plan.get("title") or title_from_prompt(prompt))
        summary = str(
            plan.get("rationale")
            or f"SQL-backed Oracle result from {', '.join(source_tables) or 'Velocity CRM'}."
        )

        return NaturalQueryResult(
            prompt=prompt,
            sql=effective_sql,
            title=title,
            summary=summary,
            columns=columns,
            rows=rows,
            row_count=len(rows),
            source_tables=source_tables,
            component_type=visualization_decision.component_type,
            warnings=warnings,
            visualization_decision=visualization_decision,
            replan_count=attempt,
            semantic_catalog_version=CATALOG_VERSION,
        )

    async def _plan_sql(
        self,
        *,
        prompt: str,
        catalog: dict[str, Any],
        detected_intents: list[str],
        row_limit: int,
        prior_feedback: str | None = None,
    ) -> dict[str, Any]:
        """Ask the runtime LLM for a SQL plan answering *prompt*.

        The request carries the semantic context for the detected intents, a
        schema excerpt limited to intent-relevant tables (serialized JSON cut
        at 14000 characters) and, on a replan, the previous attempt's feedback.

        Returns:
            The planner's JSON object; guaranteed to have a truthy ``sql`` key.

        Raises:
            RuntimeError: no LLM providers are configured, or the response is
                not parseable JSON containing ``sql``.
        """
        try:
            providers = runtime_llm_service._provider_catalog()
        except Exception:
            # Treated as "no providers", but keep the cause visible in logs.
            logger.warning("Runtime LLM provider catalog lookup failed.", exc_info=True)
            providers = {}
        if not providers:
            raise RuntimeError("No runtime LLM providers configured for Oracle natural planning.")

        schema_full = catalog.get("tables") or {}
        relevant_tables = self._relevant_tables_for_intents(detected_intents)
        always_included = {"crm_people", "crm_leads", "inventory_projects", "inventory_units"}
        schema_brief_dict = {
            table: meta
            for table, meta in schema_full.items()
            if table in relevant_tables or table in always_included
        }
        schema_brief = json.dumps(schema_brief_dict, default=str)[:14000]
        semantic_context = build_semantic_context_for_planner(detected_intents, max_concepts=5)

        replan_section = ""
        if prior_feedback:
            replan_section = (
                f"\n\nPREVIOUS ATTEMPT FAILED - EXECUTION FEEDBACK:\n{prior_feedback}\n"
                "You must address the feedback and change the query accordingly."
            )

        response = await runtime_llm_service.chat(
            provider_id="sglang",
            model=None,
            system_prompt=(
                "You are Oracle's read-only PostgreSQL planner for Project Velocity CRM. "
                "Use the semantic catalog as the business source of truth, not raw column guessing. "
                "Generate exactly one SELECT or WITH query. "
                "Return strict JSON with keys: sql, title, rationale. "
                "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements."
            ),
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"SEMANTIC CATALOG:\n{semantic_context}\n\n"
                        f"RAW SCHEMA:\n{schema_brief}\n\n"
                        f"DETECTED INTENTS: {', '.join(detected_intents)}\n\n"
                        f"USER QUESTION:\n{prompt}\n\n"
                        f"ROW CAP: {row_limit}\n"
                        f"{replan_section}\n\n"
                        "Return strict JSON: {\"sql\": \"...\", \"title\": \"...\", \"rationale\": \"...\"}"
                    ),
                }
            ],
            temperature=0.05,
            response_format="json",
            metadata={
                "agent": "oracle_natural_db_agent_v2",
                "intents": detected_intents,
                "catalog_version": CATALOG_VERSION,
            },
        )

        message = response.get("message") or {}
        parsed = message.get("parsedJson")
        if not isinstance(parsed, dict):
            content = message.get("content") or "{}"
            if isinstance(content, str):
                try:
                    parsed = json.loads(content)
                except json.JSONDecodeError as exc:
                    # Unparseable output is the same failure as "no SQL": raise
                    # the module's own error rather than a raw decode error.
                    raise RuntimeError("Natural DB planner returned no valid SQL.") from exc
            else:
                parsed = content
        if isinstance(parsed, dict) and parsed.get("sql"):
            return parsed
        raise RuntimeError("Natural DB planner returned no valid SQL.")

    @staticmethod
    def _relevant_tables_for_intents(intents: list[str]) -> set[str]:
        """Return the union of table names mapped to *intents*.

        Intents without a mapping contribute nothing.
        """
        intent_tables: dict[str, set[str]] = {
            "last_contacted": {
                "intel_interactions",
                "crm_people",
                "crm_leads",
                "read_last_contacted",
                "crm_last_contact_read_model",
            },
            "interested_clients": {
                "crm_property_interests",
                "crm_people",
                "inventory_projects",
                "intel_qd_scores",
            },
            "qd_score": {"intel_qd_scores", "crm_people"},
            "pipeline": {"crm_opportunities", "crm_leads", "crm_people", "inventory_projects"},
            "site_visits": {"intel_visits", "crm_people", "inventory_projects"},
            "timeline": {
                "intel_interactions",
                "intel_calls",
                "intel_whatsapp_threads",
                "intel_messages",
                "intel_emails",
                "intel_visits",
                "crm_people",
            },
            "objections": {"intel_call_objections", "crm_people", "inventory_projects"},
            "broker_performance": {"crm_leads", "crm_opportunities", "crm_people"},
            "next_action": {"read_next_best_action", "crm_people"},
            "inventory": {"inventory_projects", "inventory_units", "crm_property_interests"},
            "client_360": {
                "crm_people",
                "crm_leads",
                "intel_qd_scores",
                "crm_property_interests",
                "crm_opportunities",
                "intel_interactions",
                "read_last_contacted",
                "read_next_best_action",
            },
            "extracted_facts": {"intel_extracted_facts", "crm_people"},
        }
        tables: set[str] = set()
        for intent in intents:
            tables.update(intent_tables.get(intent, set()))
        return tables


natural_db_agent = NaturalDbAgent()