# Project_Velocity/backend/oracle/natural_db_agent.py

"""
Natural DB-first Oracle agent.

Pipeline:
1. schema introspection
2. semantic SQL planning
3. plan verification and optional repair
4. SQL execution
5. execution quality profiling and auto-replan
6. visualization planning from actual result shape
"""
from __future__ import annotations

import json
import logging
import os
import re
from dataclasses import dataclass
from datetime import date, datetime, time
from decimal import Decimal
from typing import Any
from uuid import UUID

from backend.services.runtime_llm_service import runtime_llm_service
from .execution_profiler import execution_profiler
from .plan_verifier import plan_verifier
from .semantic_catalog import CATALOG_VERSION, build_semantic_context_for_planner
from .visualization_planner import VisualizationDecision, visualization_planner

try:
    import asyncpg  # type: ignore
except Exception:  # pragma: no cover
    asyncpg = None  # type: ignore

logger = logging.getLogger(__name__)

# Keywords that mark a statement as something other than a plain read.
# ``into`` is listed because PostgreSQL's ``SELECT ... INTO new_table`` creates
# a table even though the statement begins with SELECT (``INSERT INTO`` was
# already covered by ``insert``).
DESTRUCTIVE_SQL = re.compile(
    r"\b(insert|update|delete|drop|alter|truncate|copy|create|grant|revoke|call|execute|do|merge|into)\b",
    re.IGNORECASE,
)
# Captures the (optionally dotted) name that follows FROM or JOIN; the name must
# be followed by whitespace or the end of the string to match.
TABLE_REF_RE = re.compile(r"\b(?:from|join)\s+([a-zA-Z_][\w.]*)(?:\s|$)", re.IGNORECASE)
# Maximum number of profiler-triggered replans for a single prompt.
_MAX_REPLAN_ATTEMPTS = 2


def _json_safe(value: Any) -> Any:
    """Recursively convert a value into JSON-serializable primitives.

    Conversions applied:
      * ``datetime`` / ``date`` / ``time`` -> ISO-8601 string
      * ``Decimal`` -> ``float``
      * ``UUID`` -> canonical string form
      * ``list`` / ``tuple`` -> ``list`` with converted items
      * ``dict`` -> ``dict`` with stringified keys and converted values

    Any other value is returned unchanged.
    """
    if isinstance(value, (datetime, date, time)):
        return value.isoformat()
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, UUID):
        # UUID objects are not accepted by ``json.dumps``; emit the text form.
        return str(value)
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, dict):
        return {str(key): _json_safe(item) for key, item in value.items()}
    return value


def db_ready() -> bool:
    """Report whether a database connection can be attempted.

    Returns ``False`` when ``asyncpg`` could not be imported. Otherwise returns
    ``True`` when any of the configuration sources that ``connect_db`` accepts
    is present:

      * ``ORACLE_READ_DATABASE_URL`` or ``DATABASE_URL`` set to a value that
        does not start with ``PLACEHOLDER``;
      * the ``VELOCITY_DB_READ_NAME`` / ``_USER`` / ``_PASSWORD`` triple;
      * the ``VELOCITY_DB_NAME`` / ``_USER`` / ``_PASSWORD`` triple.
    """
    if asyncpg is None:
        return False

    for url_var in ("ORACLE_READ_DATABASE_URL", "DATABASE_URL"):
        url = os.getenv(url_var, "")
        if url and not url.startswith("PLACEHOLDER"):
            return True

    # The read-replica credential triple is honoured by ``connect_db`` and must
    # therefore count as a usable configuration here as well.
    credential_sets = (
        ("VELOCITY_DB_READ_NAME", "VELOCITY_DB_READ_USER", "VELOCITY_DB_READ_PASSWORD"),
        ("VELOCITY_DB_NAME", "VELOCITY_DB_USER", "VELOCITY_DB_PASSWORD"),
    )
    return any(all(os.getenv(name) for name in names) for names in credential_sets)


async def connect_db() -> Any:
    """Open an asyncpg connection using the first usable configuration.

    Sources are tried in this order:
      1. ``ORACLE_READ_DATABASE_URL`` (ignored when empty or ``PLACEHOLDER...``)
      2. ``VELOCITY_DB_READ_NAME`` / ``_USER`` / ``_PASSWORD`` (host and port
         fall back to the non-read ``VELOCITY_DB_HOST`` / ``VELOCITY_DB_PORT``)
      3. ``DATABASE_URL`` (ignored when empty or ``PLACEHOLDER...``)
      4. ``VELOCITY_DB_NAME`` / ``_USER`` / ``_PASSWORD``

    Raises:
        RuntimeError: if ``asyncpg`` could not be imported.
        KeyError: if the final fallback is reached and one of its required
            environment variables is unset.
    """
    if asyncpg is None:
        raise RuntimeError("asyncpg is not installed.")

    replica_url = os.getenv("ORACLE_READ_DATABASE_URL", "")
    if replica_url and not replica_url.startswith("PLACEHOLDER"):
        return await asyncpg.connect(replica_url)

    # Host/port of the primary double as defaults for the read replica.
    primary_host = os.getenv("VELOCITY_DB_HOST", "127.0.0.1")
    primary_port = os.getenv("VELOCITY_DB_PORT", "5432")

    replica_credentials = ("VELOCITY_DB_READ_NAME", "VELOCITY_DB_READ_USER", "VELOCITY_DB_READ_PASSWORD")
    if all(os.getenv(key) for key in replica_credentials):
        replica_settings = {
            "host": os.getenv("VELOCITY_DB_READ_HOST", primary_host),
            "port": int(os.getenv("VELOCITY_DB_READ_PORT", primary_port)),
            "database": os.environ["VELOCITY_DB_READ_NAME"],
            "user": os.environ["VELOCITY_DB_READ_USER"],
            "password": os.environ["VELOCITY_DB_READ_PASSWORD"],
        }
        return await asyncpg.connect(**replica_settings)

    primary_url = os.getenv("DATABASE_URL", "")
    if primary_url and not primary_url.startswith("PLACEHOLDER"):
        return await asyncpg.connect(primary_url)

    primary_settings = {
        "host": primary_host,
        "port": int(primary_port),
        "database": os.environ["VELOCITY_DB_NAME"],
        "user": os.environ["VELOCITY_DB_USER"],
        "password": os.environ["VELOCITY_DB_PASSWORD"],
    }
    return await asyncpg.connect(**primary_settings)


@dataclass
class NaturalQueryResult:
    """Outcome of one natural-language query: SQL, rows and presentation hints."""

    prompt: str
    sql: str
    title: str
    summary: str
    columns: list[str]
    rows: list[dict[str, Any]]
    row_count: int
    source_tables: list[str]
    component_type: str
    warnings: list[str]
    visualization_decision: VisualizationDecision | None = None
    replan_count: int = 0
    semantic_catalog_version: str = CATALOG_VERSION

    def as_dict(self) -> dict[str, Any]:
        """Return the result as a dict with camelCase keys.

        ``visualizationDecision`` is an empty dict when no decision is attached.
        """
        viz = self.visualization_decision
        viz_payload: dict[str, Any] = {}
        if viz:
            viz_payload = {
                "xAxis": viz.x_axis,
                "yAxis": viz.y_axis,
                "dimensionCols": viz.dimension_cols,
                "measureCols": viz.measure_cols,
                "widthMode": viz.width_mode,
                "minHeightPx": viz.min_height_px,
                "skeletonVariant": viz.skeleton_variant,
                "vizParams": viz.viz_params,
                "dataBindings": viz.data_bindings,
                "confidence": viz.confidence,
                "reasoning": viz.reasoning,
            }

        payload: dict[str, Any] = {
            "prompt": self.prompt,
            "sql": self.sql,
            "title": self.title,
            "summary": self.summary,
            "columns": self.columns,
            "rows": self.rows,
            "rowCount": self.row_count,
            "sourceTables": self.source_tables,
            "componentType": self.component_type,
            "warnings": self.warnings,
            "semanticCatalogVersion": self.semantic_catalog_version,
            "replanCount": self.replan_count,
            "visualizationDecision": viz_payload,
        }
        return payload


_SINGLE_QUOTED_SQL = r"'(?:[^']|'')*'"
_DOUBLE_QUOTED_SQL = r'"(?:[^"]|"")*"'
# Group 1 captures quoted strings/identifiers so they can be kept verbatim; the
# other alternatives are line and block comments, which are dropped.
_SQL_COMMENT_OR_QUOTED_RE = re.compile(
    rf"({_SINGLE_QUOTED_SQL}|{_DOUBLE_QUOTED_SQL})|--[^\n]*|/\*.*?\*/",
    re.DOTALL,
)
_SQL_QUOTED_RE = re.compile(rf"{_SINGLE_QUOTED_SQL}|{_DOUBLE_QUOTED_SQL}")
_SQL_PAREN_GROUP_RE = re.compile(r"\([^()]*\)")
_SQL_ROW_LIMIT_RE = re.compile(r"\blimit\b|\bfetch\s+(?:first|next)\b", re.IGNORECASE)


def _strip_sql_comments(sql: str) -> str:
    """Remove ``--`` and ``/* */`` comments while leaving quoted text untouched."""
    # A comment becomes a single space so the tokens around it cannot fuse.
    return _SQL_COMMENT_OR_QUOTED_RE.sub(lambda match: match.group(1) or " ", sql)


def _has_outer_row_limit(sql: str) -> bool:
    """Return True when a LIMIT / FETCH FIRST clause exists outside any parentheses."""
    flattened = _SQL_QUOTED_RE.sub("''", sql)
    previous = None
    # Collapse parenthesised groups innermost-first until nothing changes, so
    # only top-level text remains.
    while previous != flattened:
        previous = flattened
        flattened = _SQL_PAREN_GROUP_RE.sub(" ", flattened)
    return _SQL_ROW_LIMIT_RE.search(flattened) is not None


def sanitize_sql(sql: str, row_limit: int) -> tuple[str, list[str], list[str]]:
    """Validate planner SQL as read-only and make sure it carries a row cap.

    Args:
        sql: Candidate SQL text.
        row_limit: Row cap appended as ``LIMIT`` when the outer query has none.

    Returns:
        ``(clean_sql, tables, warnings)`` where ``clean_sql`` has comments and
        trailing semicolons removed, ``tables`` lists the lower-cased relation
        names found after FROM/JOIN in first-seen order, and ``warnings``
        records any automatic adjustment.

    Raises:
        ValueError: if the statement does not start with SELECT/WITH or
            contains a keyword matched by ``DESTRUCTIVE_SQL``.
    """
    warnings: list[str] = []
    clean = _strip_sql_comments(sql).strip().rstrip(";").rstrip()
    if not re.match(r"^(select|with)\b", clean, re.IGNORECASE):
        raise ValueError("Oracle SQL agent only accepts SELECT or WITH queries.")
    # NOTE: the keyword scan covers the whole statement, string literals
    # included, so it errs on the side of rejecting.
    if DESTRUCTIVE_SQL.search(clean):
        raise ValueError("Oracle SQL agent blocked non-read SQL.")

    tables: list[str] = []
    for match in TABLE_REF_RE.finditer(clean):
        table = match.group(1).split(".")[-1].strip('"').lower()
        if table in {"lateral", "select"}:
            continue
        if table and table not in tables:
            tables.append(table)

    # A plain substring test would be fooled by identifiers such as
    # ``credit_limit``, by a LIMIT inside a subquery/CTE, or by the word inside
    # a string literal; only a top-level clause counts as an existing cap.
    if not _has_outer_row_limit(clean):
        clean += f" LIMIT {row_limit}"
        warnings.append(f"Row cap {row_limit} auto-applied (query had no LIMIT).")
    return clean, tables, warnings


# Intent name -> trigger phrases, in the order intents are reported.
_INTENT_KEYWORDS = (
    ("last_contacted", (
        "last contact", "last contacted", "recently contacted", "last call",
        "last message", "last whatsapp", "contacted us", "follow-up", "follow up",
        "days since", "no contact",
    )),
    ("interested_clients", (
        "interested in", "shown interest", "interest in", "interested clients",
        "project interest", "property interest",
    )),
    ("qd_score", ("qd score", "qualification score", "desire score", "intent score", "qd")),
    ("pipeline", ("pipeline", "stage", "funnel", "kanban", "deal")),
    ("site_visits", ("site visit", "visited", "visit")),
    ("timeline", (
        "call", "transcript", "whatsapp", "email", "message", "conversation",
        "interaction", "timeline", "activity",
    )),
    ("objections", ("objection", "concern", "complaint", "pushback")),
    ("broker_performance", ("broker", "agent performance", "referral")),
    ("next_action", ("next action", "next step", "what should i do", "follow-up priority", "action queue")),
    ("inventory", ("project", "unit", "inventory", "available", "price", "configuration")),
    ("client_360", ("client 360", "dossier", "profile")),
    ("extracted_facts", ("fact", "memory", "promise", "commitment", "budget", "preference")),
)
# One compiled pattern per intent. The leading ``\b`` anchors each phrase to the
# start of a word, so inflections still match ("calls", "units") while phrases
# buried inside other words do not ("ideal", "opportunity").
_INTENT_PATTERNS = tuple(
    (intent, re.compile(r"\b(?:" + "|".join(re.escape(keyword) for keyword in keywords) + r")"))
    for intent, keywords in _INTENT_KEYWORDS
)


def _detect_intents(prompt: str) -> list[str]:
    """Map a prompt to the list of intents whose trigger phrases it contains.

    Matching is case-insensitive and anchored at word starts. Intents are
    returned in the fixed order of ``_INTENT_KEYWORDS``; when nothing matches
    the result is ``["last_contacted"]``.
    """
    lowered = prompt.lower()
    intents = [intent for intent, pattern in _INTENT_PATTERNS if pattern.search(lowered)]
    return intents or ["last_contacted"]


def title_from_prompt(prompt: str) -> str:
    """Derive a display title from a prompt.

    Whitespace runs are collapsed, surrounding spaces and ``?``/``.``/``!`` are
    trimmed, the first character is upper-cased and the text is cut after the
    80th character. A prompt that trims to nothing yields the default title.
    """
    collapsed = re.sub(r"\s+", " ", prompt.strip())
    trimmed = collapsed.strip(" ?.!")
    if not trimmed:
        return "Oracle Query Result"
    first_char, remainder = trimmed[:1], trimmed[1:80]
    return first_char.upper() + remainder


class NaturalDbAgent:
    async def schema_catalog(self, conn: Any | None = None) -> dict[str, Any]:
        own_conn = conn is None
        if conn is None:
            if not db_ready():
                return {"tables": [], "available": False}
            conn = await connect_db()
        try:
            table_names = await conn.fetch(
                """
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema = 'public' AND table_type = 'BASE TABLE'
                ORDER BY table_name
                """
            )
            public_tables = [row["table_name"] for row in table_names]
            rows = await conn.fetch(
                """
                SELECT c.table_name, c.column_name, c.data_type, c.udt_name, c.is_nullable
                FROM information_schema.columns c
                WHERE c.table_schema = 'public'
                ORDER BY c.table_name, c.ordinal_position
                """
            )
            counts: dict[str, int | None] = {}
            for table in public_tables:
                exists = await conn.fetchval("SELECT to_regclass($1)", f"public.{table}")
                counts[table] = None if not exists else int(await conn.fetchval(f'SELECT COUNT(*) FROM "{table}"'))

            tables: dict[str, dict[str, Any]] = {}
            for row in rows:
                entry = tables.setdefault(row["table_name"], {"columns": [], "rowCount": counts.get(row["table_name"])})
                entry["columns"].append(
                    {
                        "name": row["column_name"],
                        "dataType": row["data_type"],
                        "udtName": row["udt_name"],
                        "nullable": row["is_nullable"] == "YES",
                    }
                )
            return {"available": True, "tables": tables, "allowedTables": public_tables}
        finally:
            if own_conn:
                await conn.close()

    async def data_health(self, conn: Any | None = None) -> dict[str, Any]:
        catalog = await self.schema_catalog(conn)
        expected = {
            "crm_people": 341,
            "crm_leads": 250,
            "crm_opportunities": 400,
            "crm_property_interests": 400,
            "intel_interactions": 1897,
            "intel_messages": 6944,
            "intel_calls": 478,
            "intel_transcripts": 231,
            "intel_emails": 149,
            "intel_visits": 305,
            "intel_reminders": 759,
            "intel_extracted_facts": 1686,
            "read_last_contacted": 250,
            "read_next_best_action": 250,
        }
        tables = catalog.get("tables", {})
        counts = {table: (meta or {}).get("rowCount") for table, meta in sorted(tables.items())}
        return {
            "counts": counts,
            "expectedSyntheticV2Counts": expected,
            "missingTables": [table for table, count in counts.items() if count is None],
            "emptyTables": [table for table, count in counts.items() if count == 0],
            "belowExpected": {
                table: {"expected": expected_count, "actual": counts.get(table)}
                for table, expected_count in expected.items()
                if (counts.get(table) or 0) < expected_count
            },
        }

    async def execute_prompt(self, prompt: str, *, row_limit: int = 100, conn: Any | None = None) -> NaturalQueryResult:
        if not prompt.strip():
            raise ValueError("Prompt is required.")

        own_conn = conn is None
        if conn is None:
            if not db_ready():
                raise RuntimeError("Database unavailable for Oracle natural query.")
            conn = await connect_db()
        try:
            catalog = await self.schema_catalog(conn)
            detected_intents = _detect_intents(prompt)
            return await self._pipeline(
                conn=conn,
                prompt=prompt,
                catalog=catalog,
                detected_intents=detected_intents,
                row_limit=row_limit,
                attempt=0,
                prior_feedback=None,
            )
        finally:
            if own_conn:
                await conn.close()

    async def _pipeline(
        self,
        *,
        conn: Any,
        prompt: str,
        catalog: dict[str, Any],
        detected_intents: list[str],
        row_limit: int,
        attempt: int,
        prior_feedback: str | None,
    ) -> NaturalQueryResult:
        warnings: list[str] = []

        plan = await self._plan_sql(
            prompt=prompt,
            catalog=catalog,
            detected_intents=detected_intents,
            row_limit=row_limit,
            prior_feedback=prior_feedback,
        )
        raw_sql = str(plan.get("sql") or "").strip()
        if not raw_sql:
            raise RuntimeError("Natural SQL planner returned no SQL.")

        verification = await plan_verifier.verify_and_repair(
            sql=raw_sql,
            prompt=prompt,
            detected_intents=detected_intents,
            row_limit=row_limit,
            llm_service=runtime_llm_service,
        )
        if verification.was_repaired:
            warnings.append(
                "Plan verifier repaired violations: "
                + ", ".join(violation.rule for violation in verification.violations if violation.severity == "blocking")
            )
        if not verification.passed and verification.repair_failed:
            warnings.append("Plan verifier found violations but repair failed. Proceeding with original SQL.")
        if verification.notes:
            warnings.extend(verification.notes)

        effective_sql, source_tables, sanitize_warnings = sanitize_sql(verification.sql, row_limit)
        warnings.extend(sanitize_warnings)

        try:
            records = await conn.fetch(effective_sql)
        except Exception as exc:
            raise RuntimeError(f"Natural SQL execution failed: {exc}") from exc

        rows = [_json_safe(dict(record)) for record in records]
        columns = list(rows[0].keys()) if rows else []

        profile = execution_profiler.profile(
            rows=rows,
            columns=columns,
            sql=effective_sql,
            prompt=prompt,
            source_tables=source_tables,
            row_limit=row_limit,
        )

        if not profile.passed and attempt < _MAX_REPLAN_ATTEMPTS:
            feedback = " | ".join(profile.replan_hints)
            warnings.append(f"Auto-replan triggered (attempt {attempt + 1}): {feedback[:160]}")
            return await self._pipeline(
                conn=conn,
                prompt=prompt,
                catalog=catalog,
                detected_intents=detected_intents,
                row_limit=row_limit,
                attempt=attempt + 1,
                prior_feedback=feedback,
            )

        if not profile.passed:
            for issue in profile.issues:
                if issue.severity == "blocking":
                    warnings.append(f"Quality issue after {attempt} replans: [{issue.code}] {issue.description}")

        visualization_decision = visualization_planner.plan(
            rows=rows,
            columns=columns,
            prompt=prompt,
            source_tables=source_tables,
            profile_suggested_type=profile.suggested_component_type,
            title_from_planner=str(plan.get("title") or ""),
        )

        title = visualization_decision.title or str(plan.get("title") or title_from_prompt(prompt))
        summary = str(plan.get("rationale") or f"SQL-backed Oracle result from {', '.join(source_tables) or 'Velocity CRM'}.")

        return NaturalQueryResult(
            prompt=prompt,
            sql=effective_sql,
            title=title,
            summary=summary,
            columns=columns,
            rows=rows,
            row_count=len(rows),
            source_tables=source_tables,
            component_type=visualization_decision.component_type,
            warnings=warnings,
            visualization_decision=visualization_decision,
            replan_count=attempt,
            semantic_catalog_version=CATALOG_VERSION,
        )

    async def _plan_sql(
        self,
        *,
        prompt: str,
        catalog: dict[str, Any],
        detected_intents: list[str],
        row_limit: int,
        prior_feedback: str | None = None,
    ) -> dict[str, Any]:
        try:
            providers = runtime_llm_service._provider_catalog()
        except Exception:
            providers = {}
        if not providers:
            raise RuntimeError("No runtime LLM providers configured for Oracle natural planning.")

        schema_full = catalog.get("tables", {})
        relevant_tables = self._relevant_tables_for_intents(detected_intents)
        schema_brief_dict = {
            table: meta
            for table, meta in schema_full.items()
            if table in relevant_tables or table in {"crm_people", "crm_leads", "inventory_projects", "inventory_units"}
        }
        schema_brief = json.dumps(schema_brief_dict, default=str)[:14000]
        semantic_context = build_semantic_context_for_planner(detected_intents, max_concepts=5)

        replan_section = ""
        if prior_feedback:
            replan_section = (
                f"\n\nPREVIOUS ATTEMPT FAILED - EXECUTION FEEDBACK:\n{prior_feedback}\n"
                "You must address the feedback and change the query accordingly."
            )

        response = await runtime_llm_service.chat(
            provider_id="sglang",
            model=None,
            system_prompt=(
                "You are Oracle's read-only PostgreSQL planner for Project Velocity CRM. "
                "Use the semantic catalog as the business source of truth, not raw column guessing. "
                "Generate exactly one SELECT or WITH query. "
                "Return strict JSON with keys: sql, title, rationale. "
                "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements."
            ),
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"SEMANTIC CATALOG:\n{semantic_context}\n\n"
                        f"RAW SCHEMA:\n{schema_brief}\n\n"
                        f"DETECTED INTENTS: {', '.join(detected_intents)}\n\n"
                        f"USER QUESTION:\n{prompt}\n\n"
                        f"ROW CAP: {row_limit}\n"
                        f"{replan_section}\n\n"
                        "Return strict JSON: {\"sql\": \"...\", \"title\": \"...\", \"rationale\": \"...\"}"
                    ),
                }
            ],
            temperature=0.05,
            response_format="json",
            metadata={
                "agent": "oracle_natural_db_agent_v2",
                "intents": detected_intents,
                "catalog_version": CATALOG_VERSION,
            },
        )
        message = response.get("message") or {}
        parsed = message.get("parsedJson")
        content = message.get("content") or "{}"
        if not isinstance(parsed, dict):
            parsed = json.loads(content) if isinstance(content, str) else content
        if isinstance(parsed, dict) and parsed.get("sql"):
            return parsed
        raise RuntimeError("Natural DB planner returned no valid SQL.")

    @staticmethod
    def _relevant_tables_for_intents(intents: list[str]) -> set[str]:
        intent_tables: dict[str, set[str]] = {
            "last_contacted": {
                "intel_interactions",
                "crm_people",
                "crm_leads",
                "read_last_contacted",
                "crm_last_contact_read_model",
            },
            "interested_clients": {
                "crm_property_interests",
                "crm_people",
                "inventory_projects",
                "intel_qd_scores",
            },
            "qd_score": {"intel_qd_scores", "crm_people"},
            "pipeline": {"crm_opportunities", "crm_leads", "crm_people", "inventory_projects"},
            "site_visits": {"intel_visits", "crm_people", "inventory_projects"},
            "timeline": {
                "intel_interactions",
                "intel_calls",
                "intel_whatsapp_threads",
                "intel_messages",
                "intel_emails",
                "intel_visits",
                "crm_people",
            },
            "objections": {"intel_call_objections", "crm_people", "inventory_projects"},
            "broker_performance": {"crm_leads", "crm_opportunities", "crm_people"},
            "next_action": {"read_next_best_action", "crm_people"},
            "inventory": {"inventory_projects", "inventory_units", "crm_property_interests"},
            "client_360": {
                "crm_people",
                "crm_leads",
                "intel_qd_scores",
                "crm_property_interests",
                "crm_opportunities",
                "intel_interactions",
                "read_last_contacted",
                "read_next_best_action",
            },
            "extracted_facts": {"intel_extracted_facts", "crm_people"},
        }
        tables: set[str] = set()
        for intent in intents:
            tables.update(intent_tables.get(intent, set()))
        return tables


# Module-level NaturalDbAgent instance, created when this module is imported.
natural_db_agent = NaturalDbAgent()