# Project_Velocity/backend/oracle/codebook_service.py

"""
oracle/codebook_service.py
Loads, normalizes, and retrieves Oracle Canvas codebook examples from the
expanded GPT and Claude seed packs delivered in Sprint 1.

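The runtime treats the GPT pack as the primary normalized corpus and uses the
Claude pack as a supplement when it adds unique examples or metadata. If a
pre-merged runtime codebook file is present it is loaded instead of the two
packs, and the bundled seed file is always consulted as a final fallback.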
"""
from __future__ import annotations

import hashlib
import json
import logging
import re
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# A token is a maximal run of ASCII lowercase letters and digits; ``_tokenize``
# lowercases its input before applying this pattern, so "_" and punctuation
# act as separators.
_TOKEN_RE = re.compile(r"[a-z0-9]+")
# Words that ``_tokenize`` discards, so they never become score/search terms.
_STOPWORDS = {
    "a", "an", "and", "as", "at", "build", "canvas", "chart", "client", "clients",
    "for", "from", "get", "give", "in", "into", "is", "list", "me", "of", "on",
    "or", "oracle", "please", "render", "show", "surface", "that", "the", "this",
    "to", "view", "with",
}


@dataclass(frozen=True)
class CodebookExample:
    """One normalized codebook example, as built by ``_normalize_examples``.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. The two ``dict`` fields are still mutable objects, and
    because they are unhashable, calling ``hash()`` on an instance raises
    ``TypeError``.
    """

    # Raw "example_id" when present, otherwise a generated 16-hex-char id.
    example_id: str
    chapter_id: str
    # Looked up from the payload's chapter list; falls back to ``chapter_id``.
    chapter_name: str
    subchapter_id: str
    # Looked up from the payload's subchapter list; falls back to ``subchapter_id``.
    subchapter_name: str
    # Raw title, else the template name, else "Oracle Component".
    title: str
    # Raw template name, else ``title``.
    template_name: str
    # Raw component type, else "summary_card".
    component_type: str
    accepted_shapes: tuple[str, ...]
    # Example body copied from the source file ("example_json"), or {}.
    example_json: dict[str, Any]
    quality_notes: str
    is_canonical: bool
    # Label of the source file this example was loaded from.
    source_pack: str
    surface_targets: tuple[str, ...]
    policy_tags: tuple[str, ...]
    backend_contract_hints: dict[str, Any]
    # Tokens drawn from title, template name, component type, chapter and
    # subchapter names, quality notes and policy tags; used for matching.
    score_terms: tuple[str, ...]


def _repo_root() -> Path:
    """Return the directory two levels above this module's own directory."""
    module_file = Path(__file__).resolve()
    # parents[0] is the module's directory, so parents[2] is two levels up.
    return module_file.parents[2]


def _safe_load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the parsed JSON document.

    I/O and JSON decoding errors are not handled here; they propagate to the
    caller.
    """
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)


def _tokenize(value: str) -> list[str]:
    """Split *value* into lowercase search tokens.

    Tokens are the ``_TOKEN_RE`` matches of the lowercased text, in order of
    appearance (duplicates kept). Single-character tokens and anything listed
    in ``_STOPWORDS`` are dropped.
    """
    kept: list[str] = []
    for candidate in _TOKEN_RE.findall(value.lower()):
        if len(candidate) <= 1 or candidate in _STOPWORDS:
            continue
        kept.append(candidate)
    return kept


def _make_template_id(example: dict[str, Any]) -> str:
    """Return a stable 16-hex-character id for a template.

    The id is the first 16 hex digits of the SHA-1 of
    ``chapter_id|subchapter_id|template_name|component_type``.

    Args:
        example: Mapping that may carry the four keys above. A missing key or
            a ``None`` value contributes an empty string; any other value is
            converted with ``str`` so that a JSON ``null`` or a numeric id
            cannot make the join fail.

    Returns:
        The 16-character lowercase hex id. String-valued inputs produce the
        same id as before this coercion was added.
    """
    parts: list[str] = []
    for key in ("chapter_id", "subchapter_id", "template_name", "component_type"):
        value = example.get(key)
        parts.append("" if value is None else str(value))
    return hashlib.sha1("|".join(parts).encode("utf-8")).hexdigest()[:16]


def _chapter_maps(payload: dict[str, Any]) -> tuple[dict[str, str], dict[str, str]]:
    """Build id -> name lookups for the chapters and subchapters in *payload*.

    Args:
        payload: Parsed codebook document. ``payload["chapters"]`` is read as
            a list of chapter objects, each optionally carrying a
            ``"subchapters"`` list. A missing or ``null`` list is treated as
            empty.

    Returns:
        ``(chapters, subchapters)``: two dicts mapping the stripped id to the
        stripped name. Entries whose id is missing, ``null`` or blank are
        skipped; a missing or ``null`` name becomes ``""``.
    """

    def _clean(value: Any) -> str:
        # A JSON null must not turn into the literal text "None".
        return "" if value is None else str(value).strip()

    chapters: dict[str, str] = {}
    subchapters: dict[str, str] = {}
    for chapter in payload.get("chapters") or []:
        chapter_id = _clean(chapter.get("chapter_id"))
        if chapter_id:
            chapters[chapter_id] = _clean(chapter.get("name"))
        # Subchapters are collected even when their parent chapter has no id.
        for subchapter in chapter.get("subchapters") or []:
            sub_id = _clean(subchapter.get("subchapter_id"))
            if sub_id:
                subchapters[sub_id] = _clean(subchapter.get("name"))
    return chapters, subchapters


def _normalize_examples(payload: dict[str, Any], source_pack: str) -> list[CodebookExample]:
    """Convert the raw examples in *payload* into ``CodebookExample`` records.

    Args:
        payload: Parsed codebook document. Examples are read from
            ``"seed_examples"`` or, when that is missing or empty, from
            ``"examples"``.
        source_pack: Label stored on every record as ``source_pack``.

    Returns:
        One record per raw example, in file order. Missing or ``null`` fields
        fall back to empty values or the defaults visible below; a ``null``
        id or note never becomes the literal text ``"None"``.
    """

    def _text(value: Any) -> str:
        # str(None) would leak the word "None" into ids and search terms.
        return "" if value is None else str(value).strip()

    chapter_names, subchapter_names = _chapter_maps(payload)
    raw_examples = payload.get("seed_examples") or payload.get("examples") or []
    normalized: list[CodebookExample] = []
    for raw in raw_examples:
        chapter_id = _text(raw.get("chapter_id"))
        subchapter_id = _text(raw.get("subchapter_id"))
        title = str(raw.get("title") or raw.get("template_name") or "Oracle Component").strip()
        template_name = str(raw.get("template_name") or title).strip()
        component_type = str(raw.get("component_type") or "summary_card").strip()
        # Computed once so the stored field and the score terms always agree.
        quality_notes = str(raw.get("quality_notes") or "")
        policy_tags = tuple(raw.get("policy_tags") or [])
        terms = _tokenize(
            " ".join(
                [
                    title,
                    template_name,
                    component_type.replace("_", " "),
                    chapter_names.get(chapter_id, ""),
                    subchapter_names.get(subchapter_id, ""),
                    quality_notes,
                    " ".join(str(tag) for tag in policy_tags),
                ]
            )
        )
        normalized.append(
            CodebookExample(
                # The generated fallback id hashes the raw fields, not the
                # normalized ones above.
                example_id=str(raw.get("example_id") or _make_template_id(raw)),
                chapter_id=chapter_id,
                chapter_name=chapter_names.get(chapter_id, chapter_id),
                subchapter_id=subchapter_id,
                subchapter_name=subchapter_names.get(subchapter_id, subchapter_id),
                title=title,
                template_name=template_name,
                component_type=component_type,
                accepted_shapes=tuple(raw.get("accepted_shapes") or []),
                example_json=raw.get("example_json") or {},
                quality_notes=quality_notes,
                is_canonical=bool(raw.get("is_canonical")),
                source_pack=source_pack,
                surface_targets=tuple(raw.get("surface_targets") or []),
                policy_tags=policy_tags,
                backend_contract_hints=dict(raw.get("backend_contract_hints") or {}),
                score_terms=tuple(terms),
            )
        )
    return normalized


class OracleCodebookService:
    def __init__(self) -> None:
        root = _repo_root()
        self.runtime_merged_path = root / "backend" / "oracle" / "oracle_runtime_codebook_merged.json"
        self.primary_path = root / ".Agent Context" / "Sprint 1" / "Sayan Multi-Surface and Oracle Delivery Pack" / "Sample JSON Schema" / "GPT 5.4" / "oracle_canvas_json_expansion_pack" / "db" / "oracle_template_seed_db_expanded_v1.pretty.json"
        self.secondary_path = root / ".Agent Context" / "Sprint 1" / "Sayan Multi-Surface and Oracle Delivery Pack" / "Sample JSON Schema" / "Claude Sonnet 4.6" / "oracle_template_expansion" / "oracle_template_seed_db_expanded.json"
        self.fallback_path = root / "backend" / "oracle" / "oracle_template_seed_db.json"

    @lru_cache(maxsize=1)
    def load(self) -> dict[str, Any]:
        corpora: list[CodebookExample] = []
        sources_loaded: list[str] = []
        source_paths: list[tuple[Path, str]]
        if self.runtime_merged_path.exists():
            source_paths = [
                (self.runtime_merged_path, "runtime_merged"),
                (self.fallback_path, "runtime_seed_fallback"),
            ]
        else:
            source_paths = [
                (self.primary_path, "gpt_5_4"),
                (self.secondary_path, "claude_sonnet_4_6"),
                (self.fallback_path, "runtime_seed_fallback"),
            ]

        for path, label in source_paths:
            if not path.exists():
                continue
            payload = _safe_load_json(path)
            examples = _normalize_examples(payload, label)
            if examples:
                corpora.extend(examples)
                sources_loaded.append(f"{label}:{len(examples)}")

        deduped: dict[tuple[str, str, str], CodebookExample] = {}
        for example in corpora:
            key = (example.subchapter_id, example.template_name.lower(), example.title.lower())
            existing = deduped.get(key)
            if existing is None:
                deduped[key] = example
                continue
            # Prefer canonical GPT examples, then canonical examples, then richer source pack.
            if example.source_pack == "gpt_5_4" and existing.source_pack != "gpt_5_4":
                deduped[key] = example
            elif example.is_canonical and not existing.is_canonical:
                deduped[key] = example

        examples = list(deduped.values())
        logger.info("Oracle codebook loaded from %s", ", ".join(sources_loaded) or "no sources")
        return {
            "examples": examples,
            "source_summary": sources_loaded,
            "template_count": len({(e.chapter_id, e.subchapter_id, e.template_name, e.component_type) for e in examples}),
        }

    def stats(self) -> dict[str, Any]:
        data = self.load()
        examples: list[CodebookExample] = data["examples"]
        return {
            "example_count": len(examples),
            "template_count": data["template_count"],
            "source_summary": data["source_summary"],
        }

    def list_templates(
        self,
        *,
        category: str | None = None,
        status: str | None = None,
        search: str | None = None,
        limit: int = 50,
        offset: int = 0,
    ) -> dict[str, Any]:
        del status  # runtime codebook templates are always active catalog entries
        examples: list[CodebookExample] = self.load()["examples"]
        templates: dict[str, dict[str, Any]] = {}
        for example in examples:
            if category and category.lower() not in {example.chapter_name.lower(), example.subchapter_name.lower()}:
                continue
            if search:
                terms = set(example.score_terms)
                if not set(_tokenize(search)).intersection(terms):
                    continue
            template_id = _make_template_id(
                {
                    "chapter_id": example.chapter_id,
                    "subchapter_id": example.subchapter_id,
                    "template_name": example.template_name,
                    "component_type": example.component_type,
                }
            )
            record = templates.get(template_id)
            if record is None:
                templates[template_id] = {
                    "templateId": template_id,
                    "tenantId": "_system",
                    "name": example.template_name,
                    "category": example.chapter_name,
                    "status": "catalog_active",
                    "origin": "premade",
                    "version": "codebook-v2",
                    "acceptedShapes": list(example.accepted_shapes),
                    "description": f"{example.subchapter_name} · {example.title}",
                    "chapterId": example.chapter_id,
                    "subchapterId": example.subchapter_id,
                    "componentType": example.component_type,
                    "sourcePack": example.source_pack,
                    "useCount": 0,
                    "updatedAt": None,
                    "createdAt": None,
                }
        ordered = list(templates.values())
        ordered.sort(key=lambda item: (item["category"], item["name"]))
        total = len(ordered)
        return {
            "total": total,
            "templates": ordered[offset: offset + limit],
        }

    def search_examples(self, prompt: str, *, limit: int = 8) -> list[CodebookExample]:
        prompt_terms = set(_tokenize(prompt))
        if not prompt_terms:
            prompt_terms = set(_tokenize(prompt.replace("_", " ")))

        lowered_prompt = prompt.lower()
        crm_prompt = any(term in lowered_prompt for term in ("client", "clients", "contact", "contacts", "crm", "lead", "account"))
        interaction_prompt = any(term in lowered_prompt for term in ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up"))
        property_prompt = any(term in lowered_prompt for term in ("property", "properties", "project", "projects", "interest", "interested"))

        scored: list[tuple[int, CodebookExample]] = []
        for example in self.load()["examples"]:
            score = 0
            term_set = set(example.score_terms)
            overlap = prompt_terms.intersection(term_set)
            score += len(overlap) * 6
            if example.template_name.lower() in lowered_prompt:
                score += 24
            if example.subchapter_name.lower() in lowered_prompt:
                score += 20
            if example.chapter_name.lower() in lowered_prompt:
                score += 14
            if example.component_type.replace("_", " ") in lowered_prompt:
                score += 12
            if example.is_canonical:
                score += 8
            if "live_data_first" in example.policy_tags:
                score += 4
            chapter = example.chapter_name.lower()
            subchapter = example.subchapter_name.lower()
            title = example.title.lower()
            if crm_prompt and any(term in " ".join((chapter, subchapter, title, example.template_name.lower())) for term in ("lead", "client", "contact", "crm", "account", "pipeline")):
                score += 18
            if interaction_prompt and any(term in " ".join((chapter, subchapter, title, example.template_name.lower())) for term in ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up")):
                score += 16
            if property_prompt and any(term in " ".join((chapter, subchapter, title, example.template_name.lower())) for term in ("property", "inventory", "interest", "project")):
                score += 16
            if score > 0:
                scored.append((score, example))

        scored.sort(key=lambda item: (-item[0], item[1].chapter_id, item[1].subchapter_id, item[1].title))
        selected: list[CodebookExample] = []
        seen: set[tuple[str, str]] = set()
        for _, example in scored:
            dedupe_key = (example.subchapter_id, example.template_name)
            if dedupe_key in seen:
                continue
            seen.add(dedupe_key)
            selected.append(example)
            if len(selected) >= limit:
                break
        return selected

    def synthesize_template(self, prompt: str, data_shapes: list[str] | None = None) -> dict[str, Any]:
        match = next(iter(self.search_examples(prompt, limit=1)), None)
        shapes = data_shapes or []
        if match is None:
            return {
                "templateId": hashlib.sha1(prompt.encode("utf-8")).hexdigest()[:16],
                "tenantId": "_system",
                "name": "Oracle Synthesized Draft",
                "category": "Custom",
                "status": "tenant_draft",
                "origin": "synthesized",
                "version": "1.0.0",
                "acceptedShapes": shapes,
                "description": f"Draft synthesized from prompt: {prompt[:120]}",
            }

        return {
            "templateId": _make_template_id(
                {
                    "chapter_id": match.chapter_id,
                    "subchapter_id": match.subchapter_id,
                    "template_name": match.template_name,
                    "component_type": match.component_type,
                }
            ),
            "tenantId": "_system",
            "name": match.template_name,
            "category": match.chapter_name,
            "status": "catalog_active",
            "origin": "premade",
            "version": "codebook-v2",
            "acceptedShapes": list(match.accepted_shapes or shapes),
            "description": f"Best codebook match · {match.subchapter_name}",
            "componentType": match.component_type,
            "chapterId": match.chapter_id,
            "subchapterId": match.subchapter_id,
            "sourcePack": match.source_pack,
            "exampleJson": match.example_json,
        }


# Module-level instance created at import time. Construction only resolves
# file paths; no codebook file is read until ``load`` is first called.
codebook_service = OracleCodebookService()