354 lines
15 KiB
Python
354 lines
15 KiB
Python
"""
|
|
oracle/codebook_service.py
|
|
Loads, normalizes, and retrieves Oracle Canvas codebook examples from the
|
|
expanded GPT and Claude seed packs delivered in Sprint 1.
|
|
|
|
The runtime treats the GPT pack as the primary normalized corpus and uses the
|
|
Claude pack as a supplement when it adds unique examples or metadata.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)

# A token is a maximal run of lowercase ASCII letters and digits; every other
# character (spaces, underscores, punctuation) acts as a separator.
_TOKEN_RE = re.compile(r"[a-z0-9]+")

# Words that _tokenize() discards: articles/prepositions, request verbs, and
# generic product vocabulary that would otherwise match almost everything.
_STOPWORDS = {
    # articles, conjunctions, prepositions, pronouns
    "a",
    "an",
    "and",
    "as",
    "at",
    "for",
    "from",
    "in",
    "into",
    "is",
    "me",
    "of",
    "on",
    "or",
    "that",
    "the",
    "this",
    "to",
    "with",
    # request verbs and politeness
    "build",
    "get",
    "give",
    "list",
    "please",
    "render",
    "show",
    # generic domain nouns
    "canvas",
    "chart",
    "client",
    "clients",
    "oracle",
    "surface",
    "view",
}
|
|
|
|
|
|
@dataclass(frozen=True)
class CodebookExample:
    """One normalized codebook example, as produced by ``_normalize_examples``.

    Instances are frozen: attributes cannot be reassigned after construction.
    The ``dict`` fields are still ordinary mutable dictionaries.
    """

    # --- identity and position in the chapter / subchapter hierarchy ---
    example_id: str
    chapter_id: str
    chapter_name: str
    subchapter_id: str
    subchapter_name: str

    # --- what the example renders ---
    title: str
    template_name: str
    component_type: str
    accepted_shapes: tuple[str, ...]
    example_json: dict[str, Any]

    # --- curation metadata ---
    quality_notes: str
    is_canonical: bool
    source_pack: str  # label of the pack the example was loaded from
    surface_targets: tuple[str, ...]
    policy_tags: tuple[str, ...]
    backend_contract_hints: dict[str, Any]

    # --- retrieval ---
    score_terms: tuple[str, ...]  # tokens used for search overlap scoring
|
|
|
|
|
|
def _repo_root() -> Path:
    """Return the directory two levels above the folder holding this module."""
    this_file = Path(__file__).resolve()
    # parents[0] is the containing directory, so parents[2] is two levels up.
    return this_file.parents[2]
|
|
|
|
|
|
def _safe_load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the parsed JSON document.

    No errors are caught here: a missing or unreadable file, or invalid JSON,
    raises to the caller.
    """
    text = path.read_text(encoding="utf-8")
    return json.loads(text)
|
|
|
|
|
|
def _tokenize(value: str) -> list[str]:
    """Split *value* into lowercase search tokens.

    Tokens are the ``_TOKEN_RE`` matches of the lowercased input, in order of
    appearance. Single-character tokens and stopwords are dropped; duplicates
    are kept.
    """
    tokens: list[str] = []
    for candidate in _TOKEN_RE.findall(value.lower()):
        if len(candidate) <= 1 or candidate in _STOPWORDS:
            continue
        tokens.append(candidate)
    return tokens
|
|
|
|
|
|
def _make_template_id(example: dict[str, Any]) -> str:
    """Derive a stable 16-hex-character id from a template's identifying fields.

    The id is the first 16 hex digits of the SHA-1 of
    ``chapter_id|subchapter_id|template_name|component_type``.

    Args:
        example: Mapping that may contain the four identifying keys. Missing
            keys and ``None`` values both count as the empty string; other
            non-string values are converted with ``str`` so that a raw JSON
            record with nulls or numbers cannot crash the join.

    Returns:
        A 16-character lowercase hexadecimal string.
    """
    identifying_fields = ("chapter_id", "subchapter_id", "template_name", "component_type")
    parts: list[str] = []
    for field_name in identifying_fields:
        value = example.get(field_name)
        parts.append("" if value is None else str(value))
    # SHA-1 is used only as a deterministic fingerprint, not for security.
    digest = hashlib.sha1("|".join(parts).encode("utf-8")).hexdigest()
    return digest[:16]
|
|
|
|
|
|
def _chapter_maps(payload: dict[str, Any]) -> tuple[dict[str, str], dict[str, str]]:
    """Build id -> display-name lookups for chapters and subchapters.

    Args:
        payload: Parsed seed pack. Its ``"chapters"`` entry, when present, is
            a list of chapter dicts, each optionally carrying a
            ``"subchapters"`` list.

    Returns:
        ``(chapters, subchapters)``: two dicts mapping a stripped id to its
        stripped name. Entries whose id is empty are skipped. A missing or
        null name maps to ``""``.
    """

    def _clean(value: Any) -> str:
        # JSON null must become "", not the literal string "None".
        return "" if value is None else str(value).strip()

    chapters: dict[str, str] = {}
    subchapters: dict[str, str] = {}
    # ``or []`` also covers keys that are present but null in the JSON.
    for chapter in payload.get("chapters") or []:
        chapter_id = _clean(chapter.get("chapter_id"))
        if chapter_id:
            chapters[chapter_id] = _clean(chapter.get("name"))
        for subchapter in chapter.get("subchapters") or []:
            sub_id = _clean(subchapter.get("subchapter_id"))
            if sub_id:
                subchapters[sub_id] = _clean(subchapter.get("name"))
    return chapters, subchapters
|
|
|
|
|
|
def _normalize_examples(payload: dict[str, Any], source_pack: str) -> list[CodebookExample]:
    """Convert the raw examples of one seed pack into ``CodebookExample`` objects.

    Examples are read from ``payload["seed_examples"]`` or, when that is
    missing or empty, from ``payload["examples"]``.

    Args:
        payload: Parsed seed pack (chapter metadata plus raw examples).
        source_pack: Label stored on every produced example.

    Returns:
        One ``CodebookExample`` per raw example, in input order.
    """

    def _text(value: Any) -> str:
        # JSON null must become "", not the literal string "None".
        return "" if value is None else str(value).strip()

    chapter_names, subchapter_names = _chapter_maps(payload)
    raw_examples = payload.get("seed_examples") or payload.get("examples") or []
    normalized: list[CodebookExample] = []
    for raw in raw_examples:
        chapter_id = _text(raw.get("chapter_id"))
        subchapter_id = _text(raw.get("subchapter_id"))
        title = str(raw.get("title") or raw.get("template_name") or "Oracle Component").strip()
        template_name = str(raw.get("template_name") or title).strip()
        component_type = str(raw.get("component_type") or "summary_card").strip()
        # Resolve these once so the score terms and the stored fields agree;
        # a null quality_notes must not add a bogus "none" search token.
        quality_notes = str(raw.get("quality_notes") or "")
        policy_tags = tuple(raw.get("policy_tags") or [])
        searchable_text = " ".join(
            [
                title,
                template_name,
                component_type.replace("_", " "),
                chapter_names.get(chapter_id, ""),
                subchapter_names.get(subchapter_id, ""),
                quality_notes,
                " ".join(str(tag) for tag in policy_tags),
            ]
        )
        normalized.append(
            CodebookExample(
                # Fall back to a hash of the raw identifying fields when the
                # pack does not supply an explicit example id.
                example_id=str(raw.get("example_id") or _make_template_id(raw)),
                chapter_id=chapter_id,
                # Unknown ids are shown as-is in place of a display name.
                chapter_name=chapter_names.get(chapter_id, chapter_id),
                subchapter_id=subchapter_id,
                subchapter_name=subchapter_names.get(subchapter_id, subchapter_id),
                title=title,
                template_name=template_name,
                component_type=component_type,
                accepted_shapes=tuple(raw.get("accepted_shapes") or []),
                example_json=raw.get("example_json") or {},
                quality_notes=quality_notes,
                is_canonical=bool(raw.get("is_canonical")),
                source_pack=source_pack,
                surface_targets=tuple(raw.get("surface_targets") or []),
                policy_tags=policy_tags,
                backend_contract_hints=dict(raw.get("backend_contract_hints") or {}),
                score_terms=tuple(_tokenize(searchable_text)),
            )
        )
    return normalized
|
|
|
|
|
|
class OracleCodebookService:
    """Loads the Oracle codebook packs and answers catalog and search queries.

    The service reads up to three JSON sources (see ``load``), removes
    duplicate examples, and exposes the result through ``stats``,
    ``list_templates``, ``search_examples`` and ``synthesize_template``.
    """

    # Substrings that mark a prompt (``*_PROMPT_TERMS``) or an example
    # (``*_EXAMPLE_TERMS``) as belonging to a topic. A prompt/example pair on
    # the same topic earns a bonus in ``search_examples``.
    _CRM_PROMPT_TERMS = ("client", "clients", "contact", "contacts", "crm", "lead", "account")
    _CRM_EXAMPLE_TERMS = ("lead", "client", "contact", "crm", "account", "pipeline")
    _INTERACTION_TERMS = ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up")
    _PROPERTY_PROMPT_TERMS = ("property", "properties", "project", "projects", "interest", "interested")
    _PROPERTY_EXAMPLE_TERMS = ("property", "inventory", "interest", "project")

    def __init__(self) -> None:
        """Compute the candidate source paths; no pack is read here."""
        root = _repo_root()
        runtime_dir = root / "backend" / "oracle"
        pack_root = (
            root
            / ".Agent Context"
            / "Sprint 1"
            / "Sayan Multi-Surface and Oracle Delivery Pack"
            / "Sample JSON Schema"
        )
        self.runtime_merged_path = runtime_dir / "oracle_runtime_codebook_merged.json"
        self.primary_path = (
            pack_root
            / "GPT 5.4"
            / "oracle_canvas_json_expansion_pack"
            / "db"
            / "oracle_template_seed_db_expanded_v1.pretty.json"
        )
        self.secondary_path = (
            pack_root
            / "Claude Sonnet 4.6"
            / "oracle_template_expansion"
            / "oracle_template_seed_db_expanded.json"
        )
        self.fallback_path = runtime_dir / "oracle_template_seed_db.json"

    @staticmethod
    def _mentions(text: str, terms: tuple[str, ...]) -> bool:
        """Return True when any of *terms* occurs as a substring of *text*."""
        return any(term in text for term in terms)

    # NOTE: lru_cache on a method keys the cache on ``self`` and keeps that
    # instance referenced by the cache. It is kept because ``load.cache_clear()``
    # is part of the visible interface; with maxsize=1 only the most recently
    # used instance's result stays cached.
    @lru_cache(maxsize=1)
    def load(self) -> dict[str, Any]:
        """Read, normalize and de-duplicate the codebook examples.

        When the merged runtime file exists, it and the runtime seed fallback
        are read; otherwise the GPT pack, the Claude pack and the fallback are
        read, in that order. Sources that do not exist are skipped. File and
        JSON errors from an existing source propagate to the caller.

        Returns:
            A dict with ``"examples"`` (list of ``CodebookExample``),
            ``"source_summary"`` (``"label:count"`` strings for every source
            that contributed examples) and ``"template_count"`` (number of
            distinct chapter/subchapter/template/component combinations).
        """
        source_paths: list[tuple[Path, str]]
        if self.runtime_merged_path.exists():
            source_paths = [
                (self.runtime_merged_path, "runtime_merged"),
                (self.fallback_path, "runtime_seed_fallback"),
            ]
        else:
            source_paths = [
                (self.primary_path, "gpt_5_4"),
                (self.secondary_path, "claude_sonnet_4_6"),
                (self.fallback_path, "runtime_seed_fallback"),
            ]

        corpora: list[CodebookExample] = []
        sources_loaded: list[str] = []
        for path, label in source_paths:
            if not path.exists():
                continue
            loaded = _normalize_examples(_safe_load_json(path), label)
            if loaded:
                corpora.extend(loaded)
                sources_loaded.append(f"{label}:{len(loaded)}")

        # Two examples are duplicates when they share the subchapter and the
        # case-insensitive template name and title.
        deduped: dict[tuple[str, str, str], CodebookExample] = {}
        for example in corpora:
            key = (example.subchapter_id, example.template_name.lower(), example.title.lower())
            existing = deduped.get(key)
            if existing is None:
                deduped[key] = example
                continue
            # A later duplicate replaces the kept one when it comes from the
            # GPT pack and the kept one does not, or when it is canonical and
            # the kept one is not.
            if example.source_pack == "gpt_5_4" and existing.source_pack != "gpt_5_4":
                deduped[key] = example
            elif example.is_canonical and not existing.is_canonical:
                deduped[key] = example

        examples = list(deduped.values())
        logger.info("Oracle codebook loaded from %s", ", ".join(sources_loaded) or "no sources")
        distinct_templates = {
            (e.chapter_id, e.subchapter_id, e.template_name, e.component_type) for e in examples
        }
        return {
            "examples": examples,
            "source_summary": sources_loaded,
            "template_count": len(distinct_templates),
        }

    def stats(self) -> dict[str, Any]:
        """Return example count, template count and source summary of the loaded codebook."""
        data = self.load()
        return {
            "example_count": len(data["examples"]),
            "template_count": data["template_count"],
            "source_summary": data["source_summary"],
        }

    def list_templates(
        self,
        *,
        category: str | None = None,
        status: str | None = None,
        search: str | None = None,
        limit: int = 50,
        offset: int = 0,
    ) -> dict[str, Any]:
        """List the distinct templates of the codebook as catalog records.

        Args:
            category: Keep only examples whose chapter or subchapter name
                equals this value, ignoring case.
            status: Accepted for interface compatibility and ignored; every
                record is reported as ``"catalog_active"``.
            search: Keep only examples sharing at least one token with this
                text. A search made only of stopwords matches nothing.
            limit: Maximum number of records returned; negative values count
                as 0.
            offset: Number of leading records to skip; negative values count
                as 0.

        Returns:
            ``{"total": <matches before paging>, "templates": [<page>]}``,
            ordered by category and then name.
        """
        del status  # runtime codebook templates are always active catalog entries

        # Loop-invariant filter inputs are prepared once.
        wanted_category = category.lower() if category else None
        search_terms = set(_tokenize(search)) if search else None

        templates: dict[str, dict[str, Any]] = {}
        for example in self.load()["examples"]:
            if wanted_category is not None and wanted_category not in (
                example.chapter_name.lower(),
                example.subchapter_name.lower(),
            ):
                continue
            if search_terms is not None and search_terms.isdisjoint(example.score_terms):
                continue
            template_id = _make_template_id(
                {
                    "chapter_id": example.chapter_id,
                    "subchapter_id": example.subchapter_id,
                    "template_name": example.template_name,
                    "component_type": example.component_type,
                }
            )
            if template_id in templates:
                # The first example seen for a template defines its record.
                continue
            templates[template_id] = {
                "templateId": template_id,
                "tenantId": "_system",
                "name": example.template_name,
                "category": example.chapter_name,
                "status": "catalog_active",
                "origin": "premade",
                "version": "codebook-v2",
                "acceptedShapes": list(example.accepted_shapes),
                "description": f"{example.subchapter_name} · {example.title}",
                "chapterId": example.chapter_id,
                "subchapterId": example.subchapter_id,
                "componentType": example.component_type,
                "sourcePack": example.source_pack,
                "useCount": 0,
                "updatedAt": None,
                "createdAt": None,
            }

        ordered = sorted(templates.values(), key=lambda item: (item["category"], item["name"]))
        # Clamp paging so negative values cannot produce wrap-around slices.
        start = max(offset, 0)
        page_size = max(limit, 0)
        return {
            "total": len(ordered),
            "templates": ordered[start: start + page_size],
        }

    def search_examples(self, prompt: str, *, limit: int = 8) -> list[CodebookExample]:
        """Return the best-scoring examples for *prompt*, best first.

        The score of an example is the sum of: 6 per shared token; 24, 20, 14
        and 12 when the prompt contains the template name, subchapter name,
        chapter name or component type; 8 for canonical examples; 4 for the
        ``live_data_first`` policy tag; and 18 / 16 / 16 when prompt and
        example both mention CRM, interaction or property vocabulary.
        Examples scoring 0 are dropped, and at most one example is returned
        per (subchapter, template name) pair.

        Args:
            prompt: Free-text request.
            limit: Maximum number of examples returned; values <= 0 give ``[]``.
        """
        if limit <= 0:
            return []

        prompt_terms = set(_tokenize(prompt))
        lowered_prompt = prompt.lower()
        crm_prompt = self._mentions(lowered_prompt, self._CRM_PROMPT_TERMS)
        interaction_prompt = self._mentions(lowered_prompt, self._INTERACTION_TERMS)
        property_prompt = self._mentions(lowered_prompt, self._PROPERTY_PROMPT_TERMS)

        scored: list[tuple[int, CodebookExample]] = []
        for example in self.load()["examples"]:
            chapter = example.chapter_name.lower()
            subchapter = example.subchapter_name.lower()
            title = example.title.lower()
            template_name = example.template_name.lower()
            # Lowercased like the other phrases: the prompt is lowercased, so
            # a mixed-case component type could otherwise never match.
            component = example.component_type.replace("_", " ").lower()

            score = 6 * len(prompt_terms.intersection(example.score_terms))
            for phrase, bonus in (
                (template_name, 24),
                (subchapter, 20),
                (chapter, 14),
                (component, 12),
            ):
                # An empty phrase is a substring of every prompt and must not
                # earn the bonus.
                if phrase and phrase in lowered_prompt:
                    score += bonus
            # NOTE(review): the two quality bonuses below are granted even
            # when nothing in the prompt matched, so a canonical example is
            # never filtered out by the ``score > 0`` check — confirm intended.
            if example.is_canonical:
                score += 8
            if "live_data_first" in example.policy_tags:
                score += 4

            haystack = " ".join((chapter, subchapter, title, template_name))
            if crm_prompt and self._mentions(haystack, self._CRM_EXAMPLE_TERMS):
                score += 18
            if interaction_prompt and self._mentions(haystack, self._INTERACTION_TERMS):
                score += 16
            if property_prompt and self._mentions(haystack, self._PROPERTY_EXAMPLE_TERMS):
                score += 16

            if score > 0:
                scored.append((score, example))

        # Highest score first; ids and title make the order deterministic.
        scored.sort(key=lambda item: (-item[0], item[1].chapter_id, item[1].subchapter_id, item[1].title))

        selected: list[CodebookExample] = []
        seen: set[tuple[str, str]] = set()
        for _, example in scored:
            dedupe_key = (example.subchapter_id, example.template_name)
            if dedupe_key in seen:
                continue
            seen.add(dedupe_key)
            selected.append(example)
            if len(selected) >= limit:
                break
        return selected

    def synthesize_template(self, prompt: str, data_shapes: list[str] | None = None) -> dict[str, Any]:
        """Return a template record for *prompt*.

        The top result of ``search_examples`` is returned as a premade catalog
        record (including its ``exampleJson``). When the search finds nothing,
        a ``tenant_draft`` placeholder is returned whose id is derived from
        the prompt text.

        Args:
            prompt: Free-text request.
            data_shapes: Shapes used for ``acceptedShapes`` in a draft, or in
                a match whose example declares no accepted shapes.
        """
        shapes = data_shapes or []
        best = self.search_examples(prompt, limit=1)
        if not best:
            return {
                "templateId": hashlib.sha1(prompt.encode("utf-8")).hexdigest()[:16],
                "tenantId": "_system",
                "name": "Oracle Synthesized Draft",
                "category": "Custom",
                "status": "tenant_draft",
                "origin": "synthesized",
                "version": "1.0.0",
                # Copy so the record does not alias the caller's list.
                "acceptedShapes": list(shapes),
                "description": f"Draft synthesized from prompt: {prompt[:120]}",
            }

        match = best[0]
        return {
            "templateId": _make_template_id(
                {
                    "chapter_id": match.chapter_id,
                    "subchapter_id": match.subchapter_id,
                    "template_name": match.template_name,
                    "component_type": match.component_type,
                }
            ),
            "tenantId": "_system",
            "name": match.template_name,
            "category": match.chapter_name,
            "status": "catalog_active",
            "origin": "premade",
            "version": "codebook-v2",
            "acceptedShapes": list(match.accepted_shapes or shapes),
            "description": f"Best codebook match · {match.subchapter_name}",
            "componentType": match.component_type,
            "chapterId": match.chapter_id,
            "subchapterId": match.subchapter_id,
            "sourcePack": match.source_pack,
            "exampleJson": match.example_json,
        }
|
|
|
|
|
|
# Shared module-level service instance. Constructing it only computes the
# source paths; the JSON packs are read when load() is first called.
codebook_service = OracleCodebookService()
|