fix: Oracle Canvas Metadata and deterministic semantic repair

2026-04-24 15:44:00 +05:30
parent 8d41ba5549
commit 61258978e1
4 changed files with 568 additions and 14 deletions
--- a/backend/oracle/semantic_catalog.py
+++ b/backend/oracle/semantic_catalog.py
@@ -29,6 +29,8 @@ class FieldDescriptor:
    confidence: str
    description: str
    notes: str = ""
+    valid_values: tuple[str, ...] = ()
+    examples: tuple[str, ...] = ()


@dataclass(frozen=True)
@@ -54,6 +56,115 @@ class ConceptDescriptor:

 CATALOG_VERSION = "velocity_semantic_v2026_04_25_01"

+
+@dataclass(frozen=True)
+class ColumnMetadata:  # planner-facing description of a single table column
+    table: str  # table the column belongs to
+    column: str  # column name within that table
+    topic: str  # grouping key matched against the planner's relevant topics
+    meaning: str  # human-readable explanation of what the column holds
+    reliability: str  # populated with Confidence.* constants in COLUMN_METADATA
+    valid_values: tuple[str, ...] = ()  # allowed values for enum-like columns; empty when none are listed
+    examples: tuple[str, ...] = ()  # sample values; must be a tuple, not a bare string
+    usage: str = ""  # guidance text on when/how to use the column
+    avoid: bool = False  # True marks columns that should not drive the main business filter
+
+
+VALID_QD_SCORE_TYPES: tuple[str, ...] = (  # score_type values listed for intel_qd_scores; "QD" is not one of them
+    "overall",  # recommended for generic QD score prompts
+    "intent",
+    "engagement",
+    "urgency",
+    "financial_qualification",
+)
+
+
+COLUMN_METADATA: list[ColumnMetadata] = [  # per-column planner hints; positional args: table, column, topic, meaning, reliability
+    ColumnMetadata(
+        "intel_qd_scores",
+        "score_type",
+        "qd_score",
+        "Score family/category. There is no score_type value named QD.",
+        Confidence.RELIABLE,
+        valid_values=VALID_QD_SCORE_TYPES,
+        examples=("overall", "intent", "engagement"),
+        usage=(
+            "For generic QD score prompts, prefer score_type = 'overall'. "
+            "For specific intent/engagement/urgency/financial prompts, use the matching valid value. "
+            "Never filter score_type = 'QD'."
+        ),
+    ),
+    ColumnMetadata(
+        "intel_qd_scores",
+        "current_value",
+        "qd_score",
+        "Authoritative numeric score value for the selected score_type.",
+        Confidence.RELIABLE,
+        examples=("98.0", "72.4"),
+        usage="Rank, sort, average, or threshold QD-style scores with this column.",
+    ),
+    ColumnMetadata(
+        "intel_qd_scores",
+        "computed_at",
+        "qd_score",
+        "Timestamp when the score was computed.",
+        Confidence.RELIABLE,
+        examples=("2026-04-18T00:00:00",),  # trailing comma required: without it this is a str, not a 1-tuple
+        usage="Use for score freshness, not client contact recency.",
+    ),
+    ColumnMetadata(
+        "intel_interactions",
+        "happened_at",
+        "contact_recency",
+        "Primary timestamp for client contact and interaction recency.",
+        Confidence.RELIABLE,
+        usage="Use for contacted, last contacted, recent contact, activity, and timeline prompts.",
+    ),
+    ColumnMetadata(
+        "read_last_contacted",
+        "last_contact_at",
+        "contact_recency",
+        "Precomputed per-client last contact timestamp.",
+        Confidence.RELIABLE,
+        usage="Prefer for client-level last-contact summaries when this read model is available.",
+    ),
+    ColumnMetadata(
+        "edge_communication_events",
+        "timestamp",
+        "contact_recency",
+        "Legacy/sparse event timestamp that is not reliable for Oracle CRM recency.",
+        Confidence.SPARSE,
+        usage="Do not use for contact prompts.",
+        avoid=True,
+    ),
+    ColumnMetadata(
+        "crm_property_interests",
+        "last_discussed_at",
+        "contact_recency",
+        "Sparse legacy field; property interest does not prove recent contact.",
+        Confidence.SPARSE,
+        usage="Do not use as the primary recency filter.",
+        avoid=True,
+    ),
+    ColumnMetadata(
+        "crm_property_interests",
+        "project_name",
+        "property_interest",
+        "Human-readable project/property name attached to a client's interest.",
+        Confidence.RELIABLE,
+        examples=("Atri Surya Toron", "Godrej Elevate"),
+        usage="Use ILIKE filters for property/project scoped prompts.",
+    ),
+    ColumnMetadata(
+        "crm_property_interests",
+        "interest_level",
+        "property_interest",
+        "Interest strength label or score imported from CRM enrichment.",
+        Confidence.RELIABLE,
+        usage="Use with project_name and person_id to rank interested clients or properties.",
+    ),
+]
+
 CONCEPTS: list[ConceptDescriptor] = [
    ConceptDescriptor(
        concept_id="person_identity",
@@ -95,7 +206,14 @@ CONCEPTS: list[ConceptDescriptor] = [
        authoritative_fields=[
            FieldDescriptor("intel_qd_scores", "person_id", Confidence.RELIABLE, "FK to crm_people"),
            FieldDescriptor("intel_qd_scores", "current_value", Confidence.RELIABLE, "Authoritative QD score"),
-            FieldDescriptor("intel_qd_scores", "score_type", Confidence.RELIABLE, "Score family"),
+            FieldDescriptor(
+                "intel_qd_scores",
+                "score_type",
+                Confidence.RELIABLE,
+                "Score family",
+                notes="Valid values are overall, intent, engagement, urgency, financial_qualification. There is no value named QD.",
+                valid_values=VALID_QD_SCORE_TYPES,
+            ),
            FieldDescriptor("intel_qd_scores", "computed_at", Confidence.RELIABLE, "Score timestamp"),
        ],
        deprecated_fields=[
@@ -105,7 +223,9 @@ CONCEPTS: list[ConceptDescriptor] = [
        ],
        usage_notes=(
            "When a prompt mentions QD, qualification, desire, or intent score, "
-            "use intel_qd_scores.current_value. Do not substitute engagement_score."
+            "use intel_qd_scores.current_value. Do not substitute engagement_score. "
+            "Do not filter score_type = 'QD'. For generic QD prompts, use score_type = 'overall'. "
+            "Use intent, engagement, urgency, or financial_qualification only when the prompt asks for that specific family."
        ),
    ),
    ConceptDescriptor(
@@ -141,10 +261,10 @@ CONCEPTS: list[ConceptDescriptor] = [
        description="Per-person last-contact summary materialization.",
        authoritative_fields=[
            FieldDescriptor("read_last_contacted", "person_id", Confidence.RELIABLE, "FK to crm_people"),
-            FieldDescriptor("read_last_contacted", "last_contacted_at", Confidence.RELIABLE, "Last contact time"),
+            FieldDescriptor("read_last_contacted", "last_contact_at", Confidence.RELIABLE, "Last contact time"),
            FieldDescriptor("read_last_contacted", "last_channel", Confidence.RELIABLE, "Last contact channel"),
-            FieldDescriptor("read_last_contacted", "days_since_last_contact", Confidence.RELIABLE, "Recency in days"),
-            FieldDescriptor("read_last_contacted", "staleness_label", Confidence.RELIABLE, "Hot/warm/cold bucket"),
+            FieldDescriptor("read_last_contacted", "days_since_contact", Confidence.RELIABLE, "Recency in days"),
+            FieldDescriptor("read_last_contacted", "interactions_last_90d", Confidence.RELIABLE, "Recent interaction volume"),
        ],
        deprecated_fields=[
            FieldDescriptor("crm_property_interests", "last_discussed_at", Confidence.DEPRECATED, "Stale field"),
@@ -318,6 +438,8 @@ def _field_to_dict(field: FieldDescriptor) -> dict[str, Any]:
        "confidence": field.confidence,
        "description": field.description,
        **({"notes": field.notes} if field.notes else {}),
+        **({"valid_values": list(field.valid_values)} if field.valid_values else {}),
+        **({"examples": list(field.examples)} if field.examples else {}),
    }


@@ -351,10 +473,40 @@ def build_semantic_context_for_planner(detected_intents: list[str], *, max_conce
            if concept.concept_id not in seen:
                seen.add(concept.concept_id)
                ordered.append(concept)
+    relevant_topics = set(detected_intents)
+    if "last_contacted" in relevant_topics or "timeline" in relevant_topics:
+        relevant_topics.add("contact_recency")
+    if "interested_clients" in relevant_topics or "inventory" in relevant_topics:
+        relevant_topics.add("property_interest")
+    if "qd_score" in relevant_topics:
+        relevant_topics.add("qd_score")
+
+    column_metadata = [
+        {
+            "table": item.table,
+            "column": item.column,
+            "topic": item.topic,
+            "meaning": item.meaning,
+            "reliability": item.reliability,
+            **({"valid_values": list(item.valid_values)} if item.valid_values else {}),
+            **({"examples": list(item.examples)} if item.examples else {}),
+            **({"usage": item.usage} if item.usage else {}),
+            **({"avoid": item.avoid} if item.avoid else {}),
+        }
+        for item in COLUMN_METADATA
+        if item.topic in relevant_topics or item.avoid
+    ]
    return json.dumps(
        {
            "catalog_version": CATALOG_VERSION,
            "concepts": [concept_to_dict(concept) for concept in ordered[:max_concepts]],
+            "column_metadata": column_metadata,
+            "global_rules": [
+                "Do not invent enum values. Use only valid_values from column_metadata when filtering enum-like columns.",
+                "Queries that return zero rows because of impossible enum filters are invalid plans.",
+                "For contact recency, use read_last_contacted.last_contact_at or intel_interactions.happened_at.",
+                "Do not use fields marked avoid=true for the main business filter.",
+            ],
        },
        separators=(",", ":"),
    )