""" oracle/codebook_service.py Loads, normalizes, and retrieves Oracle Canvas codebook examples from the expanded GPT and Claude seed packs delivered in Sprint 1. The runtime treats the GPT pack as the primary normalized corpus and uses the Claude pack as a supplement when it adds unique examples or metadata. """ from __future__ import annotations import hashlib import json import logging import re from dataclasses import dataclass from functools import lru_cache from pathlib import Path from typing import Any logger = logging.getLogger(__name__) _TOKEN_RE = re.compile(r"[a-z0-9]+") _STOPWORDS = { "a", "an", "and", "as", "at", "build", "canvas", "chart", "client", "clients", "for", "from", "get", "give", "in", "into", "is", "list", "me", "of", "on", "or", "oracle", "please", "render", "show", "surface", "that", "the", "this", "to", "view", "with", } @dataclass(frozen=True) class CodebookExample: example_id: str chapter_id: str chapter_name: str subchapter_id: str subchapter_name: str title: str template_name: str component_type: str accepted_shapes: tuple[str, ...] example_json: dict[str, Any] quality_notes: str is_canonical: bool source_pack: str surface_targets: tuple[str, ...] policy_tags: tuple[str, ...] backend_contract_hints: dict[str, Any] score_terms: tuple[str, ...] def _repo_root() -> Path: return Path(__file__).resolve().parents[2] def _safe_load_json(path: Path) -> dict[str, Any]: with path.open("r", encoding="utf-8") as handle: return json.load(handle) def _tokenize(value: str) -> list[str]: lowered = value.lower() return [tok for tok in _TOKEN_RE.findall(lowered) if tok not in _STOPWORDS and len(tok) > 1] def _make_template_id(example: dict[str, Any]) -> str: base = "|".join( [ example.get("chapter_id", ""), example.get("subchapter_id", ""), example.get("template_name", ""), example.get("component_type", ""), ] ) return hashlib.sha1(base.encode("utf-8")).hexdigest()[:16] def _chapter_maps(payload: dict[str, Any]) -> tuple[dict[str, str], dict[str, str]]: chapters: dict[str, str] = {} subchapters: dict[str, str] = {} for chapter in payload.get("chapters", []): chapter_id = str(chapter.get("chapter_id", "")).strip() if chapter_id: chapters[chapter_id] = str(chapter.get("name", "")).strip() for subchapter in chapter.get("subchapters", []): sub_id = str(subchapter.get("subchapter_id", "")).strip() if sub_id: subchapters[sub_id] = str(subchapter.get("name", "")).strip() return chapters, subchapters def _normalize_examples(payload: dict[str, Any], source_pack: str) -> list[CodebookExample]: chapter_names, subchapter_names = _chapter_maps(payload) raw_examples = payload.get("seed_examples") or payload.get("examples") or [] normalized: list[CodebookExample] = [] for raw in raw_examples: chapter_id = str(raw.get("chapter_id", "")).strip() subchapter_id = str(raw.get("subchapter_id", "")).strip() title = str(raw.get("title") or raw.get("template_name") or "Oracle Component").strip() template_name = str(raw.get("template_name") or title).strip() component_type = str(raw.get("component_type") or "summary_card").strip() example_json = raw.get("example_json") or {} terms = _tokenize( " ".join( [ title, template_name, component_type.replace("_", " "), chapter_names.get(chapter_id, ""), subchapter_names.get(subchapter_id, ""), str(raw.get("quality_notes", "")), " ".join(raw.get("policy_tags", []) or []), ] ) ) normalized.append( CodebookExample( example_id=str(raw.get("example_id") or _make_template_id(raw)), chapter_id=chapter_id, chapter_name=chapter_names.get(chapter_id, chapter_id), subchapter_id=subchapter_id, subchapter_name=subchapter_names.get(subchapter_id, subchapter_id), title=title, template_name=template_name, component_type=component_type, accepted_shapes=tuple(raw.get("accepted_shapes") or []), example_json=example_json, quality_notes=str(raw.get("quality_notes") or ""), is_canonical=bool(raw.get("is_canonical")), source_pack=source_pack, surface_targets=tuple(raw.get("surface_targets") or []), policy_tags=tuple(raw.get("policy_tags") or []), backend_contract_hints=dict(raw.get("backend_contract_hints") or {}), score_terms=tuple(terms), ) ) return normalized class OracleCodebookService: def __init__(self) -> None: root = _repo_root() self.runtime_merged_path = root / "backend" / "oracle" / "oracle_runtime_codebook_merged.json" self.primary_path = root / ".Agent Context" / "Sprint 1" / "Sayan Multi-Surface and Oracle Delivery Pack" / "Sample JSON Schema" / "GPT 5.4" / "oracle_canvas_json_expansion_pack" / "db" / "oracle_template_seed_db_expanded_v1.pretty.json" self.secondary_path = root / ".Agent Context" / "Sprint 1" / "Sayan Multi-Surface and Oracle Delivery Pack" / "Sample JSON Schema" / "Claude Sonnet 4.6" / "oracle_template_expansion" / "oracle_template_seed_db_expanded.json" self.fallback_path = root / "backend" / "oracle" / "oracle_template_seed_db.json" @lru_cache(maxsize=1) def load(self) -> dict[str, Any]: corpora: list[CodebookExample] = [] sources_loaded: list[str] = [] source_paths: list[tuple[Path, str]] if self.runtime_merged_path.exists(): source_paths = [ (self.runtime_merged_path, "runtime_merged"), (self.fallback_path, "runtime_seed_fallback"), ] else: source_paths = [ (self.primary_path, "gpt_5_4"), (self.secondary_path, "claude_sonnet_4_6"), (self.fallback_path, "runtime_seed_fallback"), ] for path, label in source_paths: if not path.exists(): continue payload = _safe_load_json(path) examples = _normalize_examples(payload, label) if examples: corpora.extend(examples) sources_loaded.append(f"{label}:{len(examples)}") deduped: dict[tuple[str, str, str], CodebookExample] = {} for example in corpora: key = (example.subchapter_id, example.template_name.lower(), example.title.lower()) existing = deduped.get(key) if existing is None: deduped[key] = example continue # Prefer canonical GPT examples, then canonical examples, then richer source pack. if example.source_pack == "gpt_5_4" and existing.source_pack != "gpt_5_4": deduped[key] = example elif example.is_canonical and not existing.is_canonical: deduped[key] = example examples = list(deduped.values()) logger.info("Oracle codebook loaded from %s", ", ".join(sources_loaded) or "no sources") return { "examples": examples, "source_summary": sources_loaded, "template_count": len({(e.chapter_id, e.subchapter_id, e.template_name, e.component_type) for e in examples}), } def stats(self) -> dict[str, Any]: data = self.load() examples: list[CodebookExample] = data["examples"] return { "example_count": len(examples), "template_count": data["template_count"], "source_summary": data["source_summary"], } def list_templates( self, *, category: str | None = None, status: str | None = None, search: str | None = None, limit: int = 50, offset: int = 0, ) -> dict[str, Any]: del status # runtime codebook templates are always active catalog entries examples: list[CodebookExample] = self.load()["examples"] templates: dict[str, dict[str, Any]] = {} for example in examples: if category and category.lower() not in {example.chapter_name.lower(), example.subchapter_name.lower()}: continue if search: terms = set(example.score_terms) if not set(_tokenize(search)).intersection(terms): continue template_id = _make_template_id( { "chapter_id": example.chapter_id, "subchapter_id": example.subchapter_id, "template_name": example.template_name, "component_type": example.component_type, } ) record = templates.get(template_id) if record is None: templates[template_id] = { "templateId": template_id, "tenantId": "_system", "name": example.template_name, "category": example.chapter_name, "status": "catalog_active", "origin": "premade", "version": "codebook-v2", "acceptedShapes": list(example.accepted_shapes), "description": f"{example.subchapter_name} · {example.title}", "chapterId": example.chapter_id, "subchapterId": example.subchapter_id, "componentType": example.component_type, "sourcePack": example.source_pack, "useCount": 0, "updatedAt": None, "createdAt": None, } ordered = list(templates.values()) ordered.sort(key=lambda item: (item["category"], item["name"])) total = len(ordered) return { "total": total, "templates": ordered[offset: offset + limit], } def search_examples(self, prompt: str, *, limit: int = 8) -> list[CodebookExample]: prompt_terms = set(_tokenize(prompt)) if not prompt_terms: prompt_terms = set(_tokenize(prompt.replace("_", " "))) lowered_prompt = prompt.lower() crm_prompt = any(term in lowered_prompt for term in ("client", "clients", "contact", "contacts", "crm", "lead", "account")) interaction_prompt = any(term in lowered_prompt for term in ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up")) property_prompt = any(term in lowered_prompt for term in ("property", "properties", "project", "projects", "interest", "interested")) scored: list[tuple[int, CodebookExample]] = [] for example in self.load()["examples"]: score = 0 term_set = set(example.score_terms) overlap = prompt_terms.intersection(term_set) score += len(overlap) * 6 if example.template_name.lower() in lowered_prompt: score += 24 if example.subchapter_name.lower() in lowered_prompt: score += 20 if example.chapter_name.lower() in lowered_prompt: score += 14 if example.component_type.replace("_", " ") in lowered_prompt: score += 12 if example.is_canonical: score += 8 if "live_data_first" in example.policy_tags: score += 4 chapter = example.chapter_name.lower() subchapter = example.subchapter_name.lower() title = example.title.lower() if crm_prompt and any(term in " ".join((chapter, subchapter, title, example.template_name.lower())) for term in ("lead", "client", "contact", "crm", "account", "pipeline")): score += 18 if interaction_prompt and any(term in " ".join((chapter, subchapter, title, example.template_name.lower())) for term in ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up")): score += 16 if property_prompt and any(term in " ".join((chapter, subchapter, title, example.template_name.lower())) for term in ("property", "inventory", "interest", "project")): score += 16 if score > 0: scored.append((score, example)) scored.sort(key=lambda item: (-item[0], item[1].chapter_id, item[1].subchapter_id, item[1].title)) selected: list[CodebookExample] = [] seen: set[tuple[str, str]] = set() for _, example in scored: dedupe_key = (example.subchapter_id, example.template_name) if dedupe_key in seen: continue seen.add(dedupe_key) selected.append(example) if len(selected) >= limit: break return selected def synthesize_template(self, prompt: str, data_shapes: list[str] | None = None) -> dict[str, Any]: match = next(iter(self.search_examples(prompt, limit=1)), None) shapes = data_shapes or [] if match is None: return { "templateId": hashlib.sha1(prompt.encode("utf-8")).hexdigest()[:16], "tenantId": "_system", "name": "Oracle Synthesized Draft", "category": "Custom", "status": "tenant_draft", "origin": "synthesized", "version": "1.0.0", "acceptedShapes": shapes, "description": f"Draft synthesized from prompt: {prompt[:120]}", } return { "templateId": _make_template_id( { "chapter_id": match.chapter_id, "subchapter_id": match.subchapter_id, "template_name": match.template_name, "component_type": match.component_type, } ), "tenantId": "_system", "name": match.template_name, "category": match.chapter_name, "status": "catalog_active", "origin": "premade", "version": "codebook-v2", "acceptedShapes": list(match.accepted_shapes or shapes), "description": f"Best codebook match · {match.subchapter_name}", "componentType": match.component_type, "chapterId": match.chapter_id, "subchapterId": match.subchapter_id, "sourcePack": match.source_pack, "exampleJson": match.example_json, } codebook_service = OracleCodebookService()