Files
Project_Velocity/backend/oracle/codebook_service.py
2026-04-23 01:20:21 +05:30

354 lines
15 KiB
Python

"""
oracle/codebook_service.py
Loads, normalizes, and retrieves Oracle Canvas codebook examples from the
expanded GPT and Claude seed packs delivered in Sprint 1.
The runtime treats the GPT pack as the primary normalized corpus and uses the
Claude pack as a supplement when it adds unique examples or metadata.
"""
from __future__ import annotations
import hashlib
import json
import logging
import re
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Any
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches maximal runs of lowercase ASCII letters/digits; ``_tokenize``
# lowercases its input before applying it.
_TOKEN_RE = re.compile(r"[a-z0-9]+")

# Tokens that ``_tokenize`` discards because they carry no retrieval signal
# (articles, prepositions, and generic request/domain words).
_STOPWORDS = {
    # articles, conjunctions, prepositions, pronouns
    "a",
    "an",
    "and",
    "as",
    "at",
    "for",
    "from",
    "in",
    "into",
    "is",
    "me",
    "of",
    "on",
    "or",
    "that",
    "the",
    "this",
    "to",
    "with",
    # request verbs
    "build",
    "get",
    "give",
    "list",
    "please",
    "render",
    "show",
    # generic domain nouns
    "canvas",
    "chart",
    "client",
    "clients",
    "oracle",
    "surface",
    "view",
}
@dataclass(frozen=True)
class CodebookExample:
    """One normalized codebook example, as produced by ``_normalize_examples``.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # --- identity and taxonomy -------------------------------------------
    example_id: str  # id from the pack, or a derived hash when the pack omits it
    chapter_id: str
    chapter_name: str  # display name from the pack's chapter list, else the id
    subchapter_id: str
    subchapter_name: str  # display name from the pack's chapter list, else the id

    # --- template description --------------------------------------------
    title: str
    template_name: str
    component_type: str
    accepted_shapes: tuple[str, ...]
    example_json: dict[str, Any]
    quality_notes: str
    is_canonical: bool

    # --- provenance and policy -------------------------------------------
    source_pack: str  # label of the source file the example was loaded from
    surface_targets: tuple[str, ...]
    policy_tags: tuple[str, ...]
    backend_contract_hints: dict[str, Any]

    # --- retrieval ---------------------------------------------------------
    score_terms: tuple[str, ...]  # tokenized search terms precomputed at load time
def _repo_root() -> Path:
    """Return the directory two levels above the one containing this file.

    Raises:
        IndexError: if the resolved file path has fewer than three ancestors.
    """
    module_file = Path(__file__).resolve()
    ancestors = module_file.parents
    # parents[0] is the containing directory; index 2 is two levels above it.
    return ancestors[2]
def _safe_load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 text and return the decoded JSON document.

    No errors are caught here: a missing file, undecodable bytes, or invalid
    JSON all propagate to the caller.
    """
    document_text = path.read_text(encoding="utf-8")
    return json.loads(document_text)
def _tokenize(value: str) -> list[str]:
    """Split *value* into lowercase alphanumeric search tokens.

    Tokens are returned in order of appearance (duplicates kept). Single
    character tokens and entries of ``_STOPWORDS`` are dropped.
    """
    kept: list[str] = []
    for candidate in _TOKEN_RE.findall(value.lower()):
        if len(candidate) <= 1 or candidate in _STOPWORDS:
            continue
        kept.append(candidate)
    return kept
def _make_template_id(example: dict[str, Any]) -> str:
    """Derive a stable 16-hex-character id from a template's identifying fields.

    The id is the first 16 hex digits of the SHA-1 of
    ``chapter_id|subchapter_id|template_name|component_type``.

    Args:
        example: Mapping that may contain the four identifying keys. Missing
            keys and ``None`` values contribute an empty segment; any other
            non-string value is converted with ``str`` so that JSON numbers or
            nulls do not make the join raise ``TypeError``.

    Returns:
        The 16-character lowercase hexadecimal id.
    """
    identity_keys = ("chapter_id", "subchapter_id", "template_name", "component_type")
    segments: list[str] = []
    for key in identity_keys:
        value = example.get(key)
        segments.append("" if value is None else str(value))
    base = "|".join(segments)
    return hashlib.sha1(base.encode("utf-8")).hexdigest()[:16]
def _chapter_maps(payload: dict[str, Any]) -> tuple[dict[str, str], dict[str, str]]:
    """Build id -> display-name lookups for chapters and subchapters.

    Args:
        payload: Decoded seed pack. Its ``"chapters"`` entry, when present, is
            read as a list of mappings with ``chapter_id``, ``name`` and an
            optional ``subchapters`` list of mappings with ``subchapter_id``
            and ``name``.

    Returns:
        ``(chapters, subchapters)``: two dicts mapping stripped ids to stripped
        names. Entries whose id is missing, ``None`` or blank are skipped; a
        missing or ``None`` name is stored as ``""``.
    """

    def _text(value: Any) -> str:
        # JSON null must read as "missing", not as the literal string "None".
        return "" if value is None else str(value).strip()

    chapters: dict[str, str] = {}
    subchapters: dict[str, str] = {}
    # ``or []`` also covers an explicit JSON null, which ``.get(key, [])`` does not.
    for chapter in payload.get("chapters") or []:
        chapter_id = _text(chapter.get("chapter_id"))
        if chapter_id:
            chapters[chapter_id] = _text(chapter.get("name"))
        # Subchapters are collected even when the parent chapter has no usable id.
        for subchapter in chapter.get("subchapters") or []:
            sub_id = _text(subchapter.get("subchapter_id"))
            if sub_id:
                subchapters[sub_id] = _text(subchapter.get("name"))
    return chapters, subchapters
def _normalize_examples(payload: dict[str, Any], source_pack: str) -> list[CodebookExample]:
    """Convert the raw examples of one seed pack into ``CodebookExample`` records.

    Args:
        payload: Decoded seed pack. Examples are read from ``"seed_examples"``
            or, when that is missing or empty, from ``"examples"``.
        source_pack: Label recorded on every produced example.

    Returns:
        One ``CodebookExample`` per raw example, in input order. Missing
        titles, template names and component types fall back to
        ``"Oracle Component"``, the title, and ``"summary_card"`` respectively.
    """

    def _text(value: Any) -> str:
        # JSON null must read as "missing", not as the literal string "None".
        return "" if value is None else str(value).strip()

    chapter_names, subchapter_names = _chapter_maps(payload)
    raw_examples = payload.get("seed_examples") or payload.get("examples") or []
    normalized: list[CodebookExample] = []
    for raw in raw_examples:
        chapter_id = _text(raw.get("chapter_id"))
        subchapter_id = _text(raw.get("subchapter_id"))
        title = str(raw.get("title") or raw.get("template_name") or "Oracle Component").strip()
        template_name = str(raw.get("template_name") or title).strip()
        component_type = str(raw.get("component_type") or "summary_card").strip()
        # Computed once so the stored field and the search terms agree; a null
        # value must not leak the token "none" into the terms.
        quality_notes = str(raw.get("quality_notes") or "")
        policy_tags = tuple(raw.get("policy_tags") or [])

        searchable_text = " ".join(
            [
                title,
                template_name,
                component_type.replace("_", " "),
                chapter_names.get(chapter_id, ""),
                subchapter_names.get(subchapter_id, ""),
                quality_notes,
                # str() keeps a non-string tag from breaking the join.
                " ".join(str(tag) for tag in policy_tags),
            ]
        )
        terms = _tokenize(searchable_text)

        normalized.append(
            CodebookExample(
                # Packs may omit example_id; fall back to a hash of the raw identity fields.
                example_id=str(raw.get("example_id") or _make_template_id(raw)),
                chapter_id=chapter_id,
                chapter_name=chapter_names.get(chapter_id, chapter_id),
                subchapter_id=subchapter_id,
                subchapter_name=subchapter_names.get(subchapter_id, subchapter_id),
                title=title,
                template_name=template_name,
                component_type=component_type,
                accepted_shapes=tuple(raw.get("accepted_shapes") or []),
                example_json=raw.get("example_json") or {},
                quality_notes=quality_notes,
                is_canonical=bool(raw.get("is_canonical")),
                source_pack=source_pack,
                surface_targets=tuple(raw.get("surface_targets") or []),
                policy_tags=policy_tags,
                backend_contract_hints=dict(raw.get("backend_contract_hints") or {}),
                score_terms=tuple(terms),
            )
        )
    return normalized
class OracleCodebookService:
    """Load the Oracle codebook corpus once per instance and query it.

    The constructor only computes candidate file paths relative to the
    repository root; no file is read until :meth:`load` is first called
    (directly or through one of the query methods).
    """

    # Per-instance memo of the payload built by ``load``; ``None`` until the
    # first successful load. Kept on the instance (instead of ``lru_cache`` on
    # the method) so the cache neither pins instances in a class-wide cache
    # nor is evicted by another instance.
    _cache: dict[str, Any] | None = None

    # (prompt substrings, example-name substrings, bonus): when the prompt
    # contains any of the first group, examples whose chapter/subchapter/
    # title/template names contain any of the second group get the bonus.
    _DOMAIN_BOOSTS = (
        (
            ("client", "clients", "contact", "contacts", "crm", "lead", "account"),
            ("lead", "client", "contact", "crm", "account", "pipeline"),
            18,
        ),
        (
            ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up"),
            ("interaction", "timeline", "call", "message", "email", "whatsapp", "follow-up"),
            16,
        ),
        (
            ("property", "properties", "project", "projects", "interest", "interested"),
            ("property", "inventory", "interest", "project"),
            16,
        ),
    )

    def __init__(self) -> None:
        """Resolve the candidate codebook file locations (no I/O on the files)."""
        root = _repo_root()
        oracle_dir = root / "backend" / "oracle"
        sample_schema_dir = (
            root
            / ".Agent Context"
            / "Sprint 1"
            / "Sayan Multi-Surface and Oracle Delivery Pack"
            / "Sample JSON Schema"
        )
        self.runtime_merged_path = oracle_dir / "oracle_runtime_codebook_merged.json"
        self.primary_path = (
            sample_schema_dir
            / "GPT 5.4"
            / "oracle_canvas_json_expansion_pack"
            / "db"
            / "oracle_template_seed_db_expanded_v1.pretty.json"
        )
        self.secondary_path = (
            sample_schema_dir
            / "Claude Sonnet 4.6"
            / "oracle_template_expansion"
            / "oracle_template_seed_db_expanded.json"
        )
        self.fallback_path = oracle_dir / "oracle_template_seed_db.json"

    def clear_cache(self) -> None:
        """Forget the cached corpus so the next :meth:`load` re-reads the files."""
        self._cache = None

    def load(self) -> dict[str, Any]:
        """Return the merged, de-duplicated corpus, building it on first call.

        When the runtime merged file exists, it and the fallback seed file are
        read; otherwise the GPT pack, the Claude pack and the fallback seed
        file are read, in that order. Files that do not exist are skipped.
        Errors raised while reading or normalizing a file propagate, and
        nothing is cached in that case.

        Returns:
            A dict with ``"examples"`` (list of ``CodebookExample``),
            ``"source_summary"`` (``"label:count"`` strings for each source
            that yielded examples) and ``"template_count"``. The same dict
            object is returned on every call until :meth:`clear_cache`.
        """
        if self._cache is not None:
            return self._cache

        source_paths: list[tuple[Path, str]]
        if self.runtime_merged_path.exists():
            source_paths = [
                (self.runtime_merged_path, "runtime_merged"),
                (self.fallback_path, "runtime_seed_fallback"),
            ]
        else:
            source_paths = [
                (self.primary_path, "gpt_5_4"),
                (self.secondary_path, "claude_sonnet_4_6"),
                (self.fallback_path, "runtime_seed_fallback"),
            ]

        corpora: list[CodebookExample] = []
        sources_loaded: list[str] = []
        for path, label in source_paths:
            if not path.exists():
                continue
            examples = _normalize_examples(_safe_load_json(path), label)
            if examples:
                corpora.extend(examples)
                sources_loaded.append(f"{label}:{len(examples)}")

        deduped: dict[tuple[str, str, str], CodebookExample] = {}
        for example in corpora:
            key = (example.subchapter_id, example.template_name.lower(), example.title.lower())
            existing = deduped.get(key)
            if existing is None:
                deduped[key] = example
                continue
            # Prefer canonical GPT examples, then canonical examples, then richer source pack.
            if example.source_pack == "gpt_5_4" and existing.source_pack != "gpt_5_4":
                deduped[key] = example
            elif example.is_canonical and not existing.is_canonical:
                deduped[key] = example

        examples = list(deduped.values())
        logger.info("Oracle codebook loaded from %s", ", ".join(sources_loaded) or "no sources")
        self._cache = {
            "examples": examples,
            "source_summary": sources_loaded,
            "template_count": len(
                {(e.chapter_id, e.subchapter_id, e.template_name, e.component_type) for e in examples}
            ),
        }
        return self._cache

    def stats(self) -> dict[str, Any]:
        """Return example count, template count and per-source summary of the corpus."""
        data = self.load()
        examples: list[CodebookExample] = data["examples"]
        return {
            "example_count": len(examples),
            "template_count": data["template_count"],
            "source_summary": data["source_summary"],
        }

    def list_templates(
        self,
        *,
        category: str | None = None,
        status: str | None = None,
        search: str | None = None,
        limit: int = 50,
        offset: int = 0,
    ) -> dict[str, Any]:
        """List catalog templates derived from the corpus, filtered and paginated.

        Args:
            category: When given, keep only examples whose chapter or
                subchapter name equals it (case-insensitive).
            status: Accepted for interface compatibility and ignored.
            search: When given, keep only examples sharing at least one token
                with the tokenized search text. A search text that tokenizes
                to nothing (for instance only stopwords) matches no example.
            limit: Page size; negative values are treated as 0.
            offset: Index of the first template of the page; negative values
                are treated as 0.

        Returns:
            ``{"total": <matching template count>, "templates": <page>}``, the
            page being sorted by category then name. Several examples mapping
            to the same template id yield one record, taken from the first.
        """
        del status  # runtime codebook templates are always active catalog entries
        # Negative values would otherwise be read as "from the end" by slicing.
        offset = max(offset, 0)
        limit = max(limit, 0)
        wanted_category = category.lower() if category else None
        # Tokenize once instead of once per example.
        search_terms = set(_tokenize(search)) if search else None

        templates: dict[str, dict[str, Any]] = {}
        for example in self.load()["examples"]:
            if wanted_category is not None and wanted_category not in {
                example.chapter_name.lower(),
                example.subchapter_name.lower(),
            }:
                continue
            if search_terms is not None and search_terms.isdisjoint(example.score_terms):
                continue
            template_id = _make_template_id(
                {
                    "chapter_id": example.chapter_id,
                    "subchapter_id": example.subchapter_id,
                    "template_name": example.template_name,
                    "component_type": example.component_type,
                }
            )
            if template_id in templates:
                continue
            templates[template_id] = {
                "templateId": template_id,
                "tenantId": "_system",
                "name": example.template_name,
                "category": example.chapter_name,
                "status": "catalog_active",
                "origin": "premade",
                "version": "codebook-v2",
                "acceptedShapes": list(example.accepted_shapes),
                "description": f"{example.subchapter_name} · {example.title}",
                "chapterId": example.chapter_id,
                "subchapterId": example.subchapter_id,
                "componentType": example.component_type,
                "sourcePack": example.source_pack,
                "useCount": 0,
                "updatedAt": None,
                "createdAt": None,
            }

        ordered = sorted(templates.values(), key=lambda item: (item["category"], item["name"]))
        return {
            "total": len(ordered),
            "templates": ordered[offset: offset + limit],
        }

    def search_examples(self, prompt: str, *, limit: int = 8) -> list[CodebookExample]:
        """Return up to *limit* examples ranked by relevance to *prompt*.

        Scoring adds points for shared tokens, for template/subchapter/chapter
        names and the component type appearing verbatim in the prompt, for
        domain keyword matches, and flat bonuses for canonical and
        ``live_data_first`` examples. At most one example per
        ``(subchapter_id, template_name)`` pair is returned.

        Args:
            prompt: Free-text request.
            limit: Maximum number of results; values <= 0 yield an empty list.
        """
        if limit <= 0:
            return []

        prompt_terms = set(_tokenize(prompt))
        if not prompt_terms:
            prompt_terms = set(_tokenize(prompt.replace("_", " ")))
        lowered_prompt = prompt.lower()
        # Only the domain boosts triggered by this prompt need per-example checks.
        active_boosts = [
            (example_terms, bonus)
            for trigger_terms, example_terms, bonus in self._DOMAIN_BOOSTS
            if any(term in lowered_prompt for term in trigger_terms)
        ]

        scored: list[tuple[int, CodebookExample]] = []
        for example in self.load()["examples"]:
            chapter = example.chapter_name.lower()
            subchapter = example.subchapter_name.lower()
            title = example.title.lower()
            template = example.template_name.lower()
            component_phrase = example.component_type.replace("_", " ").lower()

            score = len(prompt_terms.intersection(example.score_terms)) * 6
            # An empty name is a substring of every prompt, so each phrase
            # must be non-empty before it can earn its bonus.
            if template and template in lowered_prompt:
                score += 24
            if subchapter and subchapter in lowered_prompt:
                score += 20
            if chapter and chapter in lowered_prompt:
                score += 14
            if component_phrase and component_phrase in lowered_prompt:
                score += 12
            # These bonuses are unconditional, so canonical / live-data
            # examples stay eligible even without any lexical match.
            if example.is_canonical:
                score += 8
            if "live_data_first" in example.policy_tags:
                score += 4

            names_blob = " ".join((chapter, subchapter, title, template))
            for example_terms, bonus in active_boosts:
                if any(term in names_blob for term in example_terms):
                    score += bonus

            if score > 0:
                scored.append((score, example))

        scored.sort(key=lambda item: (-item[0], item[1].chapter_id, item[1].subchapter_id, item[1].title))

        selected: list[CodebookExample] = []
        seen: set[tuple[str, str]] = set()
        for _, example in scored:
            dedupe_key = (example.subchapter_id, example.template_name)
            if dedupe_key in seen:
                continue
            seen.add(dedupe_key)
            selected.append(example)
            if len(selected) >= limit:
                break
        return selected

    def synthesize_template(self, prompt: str, data_shapes: list[str] | None = None) -> dict[str, Any]:
        """Return a template record for *prompt*: best codebook match or a draft.

        Args:
            prompt: Free-text request used to search the codebook.
            data_shapes: Shapes to report when no match is found, or when the
                matched example declares no accepted shapes.

        Returns:
            A ``catalog_active`` record built from the top search hit
            (including its ``exampleJson``), or a ``tenant_draft`` record
            keyed by a hash of the prompt when the search returns nothing.
        """
        match = next(iter(self.search_examples(prompt, limit=1)), None)
        # Copy so the returned record never aliases the caller's list.
        shapes = list(data_shapes or [])
        if match is None:
            return {
                "templateId": hashlib.sha1(prompt.encode("utf-8")).hexdigest()[:16],
                "tenantId": "_system",
                "name": "Oracle Synthesized Draft",
                "category": "Custom",
                "status": "tenant_draft",
                "origin": "synthesized",
                "version": "1.0.0",
                "acceptedShapes": shapes,
                "description": f"Draft synthesized from prompt: {prompt[:120]}",
            }
        return {
            "templateId": _make_template_id(
                {
                    "chapter_id": match.chapter_id,
                    "subchapter_id": match.subchapter_id,
                    "template_name": match.template_name,
                    "component_type": match.component_type,
                }
            ),
            "tenantId": "_system",
            "name": match.template_name,
            "category": match.chapter_name,
            "status": "catalog_active",
            "origin": "premade",
            "version": "codebook-v2",
            "acceptedShapes": list(match.accepted_shapes or shapes),
            "description": f"Best codebook match · {match.subchapter_name}",
            "componentType": match.component_type,
            "chapterId": match.chapter_id,
            "subchapterId": match.subchapter_id,
            "sourcePack": match.source_pack,
            "exampleJson": match.example_json,
        }
# Shared module-level service instance, created at import time.
codebook_service = OracleCodebookService()