fix: planning and orchestration logic for Oracle Canvas JSON component generation

2026-04-24 05:14:11 +05:30
parent 9f27e6a017
commit cf602822b0
6 changed files with 1555 additions and 115 deletions
--- a/backend/oracle/natural_db_agent.py
+++ b/backend/oracle/natural_db_agent.py
@@ -1,9 +1,13 @@
 """
 Natural DB-first Oracle agent.

-The LLM can plan arbitrary analytical SELECT statements over the full public
-Velocity app schema. The executor enforces only a read-only SQL contract and a
-UI row cap; write paths stay behind typed API endpoints.
+Pipeline:
+1. schema introspection
+2. semantic SQL planning
+3. plan verification and optional repair
+4. SQL execution
+5. execution quality profiling and auto-replan
+6. visualization planning from actual result shape
 """
 from __future__ import annotations

@@ -17,6 +21,10 @@ from decimal import Decimal
 from typing import Any

 from backend.services.runtime_llm_service import runtime_llm_service
+from .execution_profiler import execution_profiler
+from .plan_verifier import plan_verifier
+from .semantic_catalog import CATALOG_VERSION, build_semantic_context_for_planner
+from .visualization_planner import VisualizationDecision, visualization_planner

 try:
    import asyncpg  # type: ignore
@@ -30,7 +38,7 @@ DESTRUCTIVE_SQL = re.compile(
    re.IGNORECASE,
 )
 TABLE_REF_RE = re.compile(r"\b(?:from|join)\s+([a-zA-Z_][\w.]*)(?:\s|$)", re.IGNORECASE)
-CTE_NAME_RE = re.compile(r"\b(?:with|,)\s*([a-zA-Z_][\w]*)\s+as\s*\(", re.IGNORECASE)
+_MAX_REPLAN_ATTEMPTS = 2


 def _json_safe(value: Any) -> Any:
@@ -39,9 +47,9 @@ def _json_safe(value: Any) -> Any:
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, (list, tuple)):
-        return [_json_safe(v) for v in value]
+        return [_json_safe(item) for item in value]
    if isinstance(value, dict):
-        return {str(k): _json_safe(v) for k, v in value.items()}
+        return {str(key): _json_safe(item) for key, item in value.items()}
    return value


@@ -60,9 +68,11 @@ def db_ready() -> bool:
 async def connect_db() -> Any:
    if asyncpg is None:
        raise RuntimeError("asyncpg is not installed.")
+
    read_database_url = os.getenv("ORACLE_READ_DATABASE_URL", "")
    if read_database_url and not read_database_url.startswith("PLACEHOLDER"):
        return await asyncpg.connect(read_database_url)
+
    if all(os.getenv(name) for name in ("VELOCITY_DB_READ_NAME", "VELOCITY_DB_READ_USER", "VELOCITY_DB_READ_PASSWORD")):
        return await asyncpg.connect(
            host=os.getenv("VELOCITY_DB_READ_HOST", os.getenv("VELOCITY_DB_HOST", "127.0.0.1")),
@@ -71,9 +81,11 @@ async def connect_db() -> Any:
            user=os.environ["VELOCITY_DB_READ_USER"],
            password=os.environ["VELOCITY_DB_READ_PASSWORD"],
        )
+
    database_url = os.getenv("DATABASE_URL", "")
    if database_url and not database_url.startswith("PLACEHOLDER"):
        return await asyncpg.connect(database_url)
+
    return await asyncpg.connect(
        host=os.getenv("VELOCITY_DB_HOST", "127.0.0.1"),
        port=int(os.getenv("VELOCITY_DB_PORT", "5432")),
@@ -95,8 +107,12 @@ class NaturalQueryResult:
    source_tables: list[str]
    component_type: str
    warnings: list[str]
+    visualization_decision: VisualizationDecision | None = None
+    replan_count: int = 0
+    semantic_catalog_version: str = CATALOG_VERSION

    def as_dict(self) -> dict[str, Any]:
+        decision = self.visualization_decision
        return {
            "prompt": self.prompt,
            "sql": self.sql,
@@ -108,6 +124,23 @@ class NaturalQueryResult:
            "sourceTables": self.source_tables,
            "componentType": self.component_type,
            "warnings": self.warnings,
+            "semanticCatalogVersion": self.semantic_catalog_version,
+            "replanCount": self.replan_count,
+            "visualizationDecision": {
+                "xAxis": decision.x_axis,
+                "yAxis": decision.y_axis,
+                "dimensionCols": decision.dimension_cols,
+                "measureCols": decision.measure_cols,
+                "widthMode": decision.width_mode,
+                "minHeightPx": decision.min_height_px,
+                "skeletonVariant": decision.skeleton_variant,
+                "vizParams": decision.viz_params,
+                "dataBindings": decision.data_bindings,
+                "confidence": decision.confidence,
+                "reasoning": decision.reasoning,
+            }
+            if decision
+            else {},
        }


@@ -118,48 +151,74 @@ def sanitize_sql(sql: str, row_limit: int) -> tuple[str, list[str], list[str]]:
        raise ValueError("Oracle SQL agent only accepts SELECT or WITH queries.")
    if DESTRUCTIVE_SQL.search(clean):
        raise ValueError("Oracle SQL agent blocked non-read SQL.")
-    tables = []
+
+    tables: list[str] = []
    for match in TABLE_REF_RE.finditer(clean):
        table = match.group(1).split(".")[-1].strip('"').lower()
        if table in {"lateral", "select"}:
            continue
        if table and table not in tables:
            tables.append(table)
+
+    if "limit" not in clean.lower():
+        clean += f" LIMIT {row_limit}"
+        warnings.append(f"Row cap {row_limit} auto-applied (query had no LIMIT).")
    return clean, tables, warnings


-def infer_component_type(prompt: str, columns: list[str], rows: list[dict[str, Any]]) -> str:
-    lower = prompt.lower()
-    if any(term in lower for term in ("timeline", "conversation", "whatsapp", "message", "call", "email", "history")):
-        return "activity_stream"
-    if len(rows) == 1 and len(columns) <= 5 and any(isinstance(rows[0].get(c), (int, float)) for c in columns):
-        return "kpi_tile"
-    if any(c.endswith("_at") or c in {"date", "when", "timestamp", "happened_at"} for c in columns):
-        if len(rows) > 1 and any(term in lower for term in ("trend", "over time", "timeseries")):
-            return "line_chart"
-        if any(term in lower for term in ("timeline", "activity", "last", "recent")):
-            return "activity_stream"
-    numeric_cols = [c for c in columns if rows and isinstance(rows[0].get(c), (int, float))]
-    if numeric_cols and any(term in lower for term in ("count", "compare", "distribution", "most", "top", "by ")):
-        return "bar_chart"
-    return "table"
+def _detect_intents(prompt: str) -> list[str]:
+    lowered = prompt.lower()
+    intents: list[str] = []

+    if any(token in lowered for token in (
+        "last contact", "last contacted", "recently contacted", "last call",
+        "last message", "last whatsapp", "contacted us", "follow-up", "follow up",
+        "days since", "no contact",
+    )):
+        intents.append("last_contacted")

-def _looks_like_property_rollup_prompt(prompt: str) -> bool:
-    lower = prompt.lower()
-    property_terms = ("property", "properties", "project", "projects")
-    aggregate_terms = ("top", "most", "majority", "highest", "popular", "common")
-    interest_terms = ("interest", "interested", "liked", "preference", "preferences")
-    return (
-        any(term in lower for term in property_terms)
-        and any(term in lower for term in aggregate_terms)
-        and any(term in lower for term in interest_terms)
-    )
+    if any(token in lowered for token in (
+        "interested in", "shown interest", "interest in", "interested clients",
+        "project interest", "property interest",
+    )):
+        intents.append("interested_clients")
+
+    if any(token in lowered for token in ("qd score", "qualification score", "desire score", "intent score", "qd")):
+        intents.append("qd_score")
+
+    if any(token in lowered for token in ("pipeline", "stage", "funnel", "kanban", "deal")):
+        intents.append("pipeline")
+
+    if any(token in lowered for token in ("site visit", "visited", "visit")):
+        intents.append("site_visits")
+
+    if any(token in lowered for token in ("call", "transcript", "whatsapp", "email", "message", "conversation", "interaction", "timeline", "activity")):
+        intents.append("timeline")
+
+    if any(token in lowered for token in ("objection", "concern", "complaint", "pushback")):
+        intents.append("objections")
+
+    if any(token in lowered for token in ("broker", "agent performance", "referral")):
+        intents.append("broker_performance")
+
+    if any(token in lowered for token in ("next action", "next step", "what should i do", "follow-up priority", "action queue")):
+        intents.append("next_action")
+
+    if any(token in lowered for token in ("project", "unit", "inventory", "available", "price", "configuration")):
+        intents.append("inventory")
+
+    if any(token in lowered for token in ("client 360", "dossier", "profile")):
+        intents.append("client_360")
+
+    if any(token in lowered for token in ("fact", "memory", "promise", "commitment", "budget", "preference")):
+        intents.append("extracted_facts")
+
+    return intents or ["last_contacted"]


 def title_from_prompt(prompt: str) -> str:
    words = re.sub(r"\s+", " ", prompt.strip()).strip(" ?.!")
-    return words[:1].upper() + words[1:80] if words else "Oracle Query Result"
+    return (words[:1].upper() + words[1:80]) if words else "Oracle Query Result"


 class NaturalDbAgent:
@@ -187,19 +246,22 @@ class NaturalDbAgent:
                ORDER BY c.table_name, c.ordinal_position
                """
            )
-            counts = {}
+            counts: dict[str, int | None] = {}
            for table in public_tables:
                exists = await conn.fetchval("SELECT to_regclass($1)", f"public.{table}")
                counts[table] = None if not exists else int(await conn.fetchval(f'SELECT COUNT(*) FROM "{table}"'))
+
            tables: dict[str, dict[str, Any]] = {}
            for row in rows:
                entry = tables.setdefault(row["table_name"], {"columns": [], "rowCount": counts.get(row["table_name"])})
-                entry["columns"].append({
-                    "name": row["column_name"],
-                    "dataType": row["data_type"],
-                    "udtName": row["udt_name"],
-                    "nullable": row["is_nullable"] == "YES",
-                })
+                entry["columns"].append(
+                    {
+                        "name": row["column_name"],
+                        "dataType": row["data_type"],
+                        "udtName": row["udt_name"],
+                        "nullable": row["is_nullable"] == "YES",
+                    }
+                )
            return {"available": True, "tables": tables, "allowedTables": public_tables}
        finally:
            if own_conn:
@@ -228,14 +290,19 @@ class NaturalDbAgent:
        return {
            "counts": counts,
            "expectedSyntheticV2Counts": expected,
-            "missingTables": [t for t, count in counts.items() if count is None],
-            "emptyTables": [t for t, count in counts.items() if count == 0],
-            "belowExpected": {t: {"expected": e, "actual": counts.get(t)} for t, e in expected.items() if (counts.get(t) or 0) < e},
+            "missingTables": [table for table, count in counts.items() if count is None],
+            "emptyTables": [table for table, count in counts.items() if count == 0],
+            "belowExpected": {
+                table: {"expected": expected_count, "actual": counts.get(table)}
+                for table, expected_count in expected.items()
+                if (counts.get(table) or 0) < expected_count
+            },
        }

    async def execute_prompt(self, prompt: str, *, row_limit: int = 100, conn: Any | None = None) -> NaturalQueryResult:
        if not prompt.strip():
            raise ValueError("Prompt is required.")
+
        own_conn = conn is None
        if conn is None:
            if not db_ready():
@@ -243,91 +310,249 @@ class NaturalDbAgent:
            conn = await connect_db()
        try:
            catalog = await self.schema_catalog(conn)
-            plan = await self._plan_sql(prompt, catalog, row_limit)
-            return await self._run_plan(conn, prompt, plan, row_limit)
+            detected_intents = _detect_intents(prompt)
+            return await self._pipeline(
+                conn=conn,
+                prompt=prompt,
+                catalog=catalog,
+                detected_intents=detected_intents,
+                row_limit=row_limit,
+                attempt=0,
+                prior_feedback=None,
+            )
        finally:
            if own_conn:
                await conn.close()

-    async def _run_plan(self, conn: Any, prompt: str, plan: dict[str, Any], row_limit: int) -> NaturalQueryResult:
+    async def _pipeline(
+        self,
+        *,
+        conn: Any,
+        prompt: str,
+        catalog: dict[str, Any],
+        detected_intents: list[str],
+        row_limit: int,
+        attempt: int,
+        prior_feedback: str | None,
+    ) -> NaturalQueryResult:
+        warnings: list[str] = []
+
+        plan = await self._plan_sql(
+            prompt=prompt,
+            catalog=catalog,
+            detected_intents=detected_intents,
+            row_limit=row_limit,
+            prior_feedback=prior_feedback,
+        )
        raw_sql = str(plan.get("sql") or "").strip()
        if not raw_sql:
            raise RuntimeError("Natural SQL planner returned no SQL.")
-        sql, tables, warnings = sanitize_sql(raw_sql, row_limit)
+
+        verification = await plan_verifier.verify_and_repair(
+            sql=raw_sql,
+            prompt=prompt,
+            detected_intents=detected_intents,
+            row_limit=row_limit,
+            llm_service=runtime_llm_service,
+        )
+        if verification.was_repaired:
+            warnings.append(
+                "Plan verifier repaired violations: "
+                + ", ".join(violation.rule for violation in verification.violations if violation.severity == "blocking")
+            )
+        if not verification.passed and verification.repair_failed:
+            warnings.append("Plan verifier found violations but repair failed. Proceeding with original SQL.")
+        if verification.notes:
+            warnings.extend(verification.notes)
+
+        effective_sql, source_tables, sanitize_warnings = sanitize_sql(verification.sql, row_limit)
+        warnings.extend(sanitize_warnings)
+
        try:
-            records = await conn.fetch(sql)
+            records = await conn.fetch(effective_sql)
        except Exception as exc:
            raise RuntimeError(f"Natural SQL execution failed: {exc}") from exc
+
        rows = [_json_safe(dict(record)) for record in records]
        columns = list(rows[0].keys()) if rows else []
-        component_type = infer_component_type(prompt, columns, rows)
+
+        profile = execution_profiler.profile(
+            rows=rows,
+            columns=columns,
+            sql=effective_sql,
+            prompt=prompt,
+            source_tables=source_tables,
+            row_limit=row_limit,
+        )
+
+        if not profile.passed and attempt < _MAX_REPLAN_ATTEMPTS:
+            feedback = " | ".join(profile.replan_hints)
+            warnings.append(f"Auto-replan triggered (attempt {attempt + 1}): {feedback[:160]}")
+            return await self._pipeline(
+                conn=conn,
+                prompt=prompt,
+                catalog=catalog,
+                detected_intents=detected_intents,
+                row_limit=row_limit,
+                attempt=attempt + 1,
+                prior_feedback=feedback,
+            )
+
+        if not profile.passed:
+            for issue in profile.issues:
+                if issue.severity == "blocking":
+                    warnings.append(f"Quality issue after {attempt} replans: [{issue.code}] {issue.description}")
+
+        visualization_decision = visualization_planner.plan(
+            rows=rows,
+            columns=columns,
+            prompt=prompt,
+            source_tables=source_tables,
+            profile_suggested_type=profile.suggested_component_type,
+            title_from_planner=str(plan.get("title") or ""),
+        )
+
+        title = visualization_decision.title or str(plan.get("title") or title_from_prompt(prompt))
+        summary = str(plan.get("rationale") or f"SQL-backed Oracle result from {', '.join(source_tables) or 'Velocity CRM'}.")
+
        return NaturalQueryResult(
            prompt=prompt,
-            sql=sql,
-            title=str(plan.get("title") or title_from_prompt(prompt)),
-            summary=str(plan.get("rationale") or f"SQL-backed Oracle result from {', '.join(tables) or 'Velocity CRM'}."),
+            sql=effective_sql,
+            title=title,
+            summary=summary,
            columns=columns,
            rows=rows,
            row_count=len(rows),
-            source_tables=tables,
-            component_type=component_type,
+            source_tables=source_tables,
+            component_type=visualization_decision.component_type,
            warnings=warnings,
+            visualization_decision=visualization_decision,
+            replan_count=attempt,
+            semantic_catalog_version=CATALOG_VERSION,
        )

-    async def _plan_sql(self, prompt: str, catalog: dict[str, Any], row_limit: int) -> dict[str, Any]:
+    async def _plan_sql(
+        self,
+        *,
+        prompt: str,
+        catalog: dict[str, Any],
+        detected_intents: list[str],
+        row_limit: int,
+        prior_feedback: str | None = None,
+    ) -> dict[str, Any]:
        try:
            providers = runtime_llm_service._provider_catalog()
        except Exception:
            providers = {}
        if not providers:
-            raise RuntimeError("No runtime LLM providers are configured for Oracle natural planning.")
-        schema_brief = json.dumps(catalog.get("tables", {}), default=str)[:16000]
-        semantic_rules = """
-Velocity SQL semantics:
- QD score means intel_qd_scores.current_value. Do not use crm_people.engagement_score, crm_leads.engagement_score, or intel_interactions.engagement_score as QD.
- For project/property scoped prompts such as "in Atri Surya Toron", "interested in", "for project", or "for property", use crm_property_interests as the primary scoping table.
- Prefer crm_property_interests.project_name for textual project matching. inventory_projects is optional for enrichment, not the primary client-to-project relationship.
- For client lists scoped to a project, join crm_people to crm_property_interests on person_id and filter project_name case-insensitively.
- For lowest/highest/best/worst QD prompts, sort on intel_qd_scores.current_value ASC/DESC as requested.
- Respect the user-requested cardinality exactly when possible. If the prompt says five/top 5/lowest 5, return LIMIT 5.
- When listing clients, include person identity fields from crm_people such as person_id, full_name, primary_phone, and primary_email.
- When aggregating top properties/projects, group by crm_property_interests.project_name and count DISTINCT person_id.
- You may use any table in the public schema that is relevant to the question.
- Use only read-only PostgreSQL SELECT/CTE queries.
-"""
-        system = (
-            "You are Oracle's read-only PostgreSQL planner. Generate one useful SELECT or WITH query "
-            "for the user's CRM question. You have access to the full public schema. Return JSON with sql, title, rationale. "
-            "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements."
-        )
-        try:
-            response = await runtime_llm_service.chat(
-                provider_id="sglang",
-                model=None,
-                system_prompt=system,
-                messages=[{
+            raise RuntimeError("No runtime LLM providers configured for Oracle natural planning.")
+
+        schema_full = catalog.get("tables", {})
+        relevant_tables = self._relevant_tables_for_intents(detected_intents)
+        schema_brief_dict = {
+            table: meta
+            for table, meta in schema_full.items()
+            if table in relevant_tables or table in {"crm_people", "crm_leads", "inventory_projects", "inventory_units"}
+        }
+        schema_brief = json.dumps(schema_brief_dict, default=str)[:14000]
+        semantic_context = build_semantic_context_for_planner(detected_intents, max_concepts=5)
+
+        replan_section = ""
+        if prior_feedback:
+            replan_section = (
+                f"\n\nPREVIOUS ATTEMPT FAILED - EXECUTION FEEDBACK:\n{prior_feedback}\n"
+                "You must address the feedback and change the query accordingly."
+            )
+
+        response = await runtime_llm_service.chat(
+            provider_id="sglang",
+            model=None,
+            system_prompt=(
+                "You are Oracle's read-only PostgreSQL planner for Project Velocity CRM. "
+                "Use the semantic catalog as the business source of truth, not raw column guessing. "
+                "Generate exactly one SELECT or WITH query. "
+                "Return strict JSON with keys: sql, title, rationale. "
+                "Never generate INSERT, UPDATE, DELETE, DDL, COPY, or permission statements."
+            ),
+            messages=[
+                {
                    "role": "user",
                    "content": (
-                        f"Schema:\n{schema_brief}\n\n"
-                        f"Semantic rules:\n{semantic_rules}\n\n"
-                        f"Question:\n{prompt}\n\n"
-                        f"Row cap: {row_limit}\n\n"
-                        "Return strict JSON with keys: sql, title, rationale."
+                        f"SEMANTIC CATALOG:\n{semantic_context}\n\n"
+                        f"RAW SCHEMA:\n{schema_brief}\n\n"
+                        f"DETECTED INTENTS: {', '.join(detected_intents)}\n\n"
+                        f"USER QUESTION:\n{prompt}\n\n"
+                        f"ROW CAP: {row_limit}\n"
+                        f"{replan_section}\n\n"
+                        "Return strict JSON: {\"sql\": \"...\", \"title\": \"...\", \"rationale\": \"...\"}"
                    ),
-                }],
-                temperature=0.05,
-                response_format="json",
-                metadata={"agent": "oracle_natural_db_agent"},
-            )
-            message = response.get("message") or {}
-            parsed = message.get("parsedJson")
-            content = message.get("content") or "{}"
-            if not isinstance(parsed, dict):
-                parsed = json.loads(content) if isinstance(content, str) else content
-            if isinstance(parsed, dict) and parsed.get("sql"):
-                return parsed
-        except Exception as exc:
-            raise RuntimeError(f"Natural DB planner LLM failed: {exc}") from exc
+                }
+            ],
+            temperature=0.05,
+            response_format="json",
+            metadata={
+                "agent": "oracle_natural_db_agent_v2",
+                "intents": detected_intents,
+                "catalog_version": CATALOG_VERSION,
+            },
+        )
+        message = response.get("message") or {}
+        parsed = message.get("parsedJson")
+        content = message.get("content") or "{}"
+        if not isinstance(parsed, dict):
+            parsed = json.loads(content) if isinstance(content, str) else content
+        if isinstance(parsed, dict) and parsed.get("sql"):
+            return parsed
        raise RuntimeError("Natural DB planner returned no valid SQL.")

+    @staticmethod
+    def _relevant_tables_for_intents(intents: list[str]) -> set[str]:
+        intent_tables: dict[str, set[str]] = {
+            "last_contacted": {
+                "intel_interactions",
+                "crm_people",
+                "crm_leads",
+                "read_last_contacted",
+                "crm_last_contact_read_model",
+            },
+            "interested_clients": {
+                "crm_property_interests",
+                "crm_people",
+                "inventory_projects",
+                "intel_qd_scores",
+            },
+            "qd_score": {"intel_qd_scores", "crm_people"},
+            "pipeline": {"crm_opportunities", "crm_leads", "crm_people", "inventory_projects"},
+            "site_visits": {"intel_visits", "crm_people", "inventory_projects"},
+            "timeline": {
+                "intel_interactions",
+                "intel_calls",
+                "intel_whatsapp_threads",
+                "intel_messages",
+                "intel_emails",
+                "intel_visits",
+                "crm_people",
+            },
+            "objections": {"intel_call_objections", "crm_people", "inventory_projects"},
+            "broker_performance": {"crm_leads", "crm_opportunities", "crm_people"},
+            "next_action": {"read_next_best_action", "crm_people"},
+            "inventory": {"inventory_projects", "inventory_units", "crm_property_interests"},
+            "client_360": {
+                "crm_people",
+                "crm_leads",
+                "intel_qd_scores",
+                "crm_property_interests",
+                "crm_opportunities",
+                "intel_interactions",
+                "read_last_contacted",
+                "read_next_best_action",
+            },
+            "extracted_facts": {"intel_extracted_facts", "crm_people"},
+        }
+        tables: set[str] = set()
+        for intent in intents:
+            tables.update(intent_tables.get(intent, set()))
+        return tables
+
+
 natural_db_agent = NaturalDbAgent()