Files
Project_Velocity/backend/scripts/seed_synthetic_crm.py

726 lines
46 KiB
Python

#!/usr/bin/env python3
"""
Seed canonical CRM tables from db assets/synthetic_crm_v2/csv by default.
This is an initial canonical seed path, not the user-facing import approval flow.
It preserves source CSV IDs in metadata and uses deterministic UUIDs so reruns are
idempotent enough for local/prod refreshes without duplicate synthetic graphs.
"""
from __future__ import annotations
import argparse
import asyncio
import csv
import json
import logging
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Configure logging for the script: INFO level, timestamp + level + message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("velocity.seed")
# Three directory levels above this file (presumably the repository root,
# given the script lives under backend/scripts/ -- verify if the file moves).
REPO_ROOT = Path(__file__).parent.parent.parent
_V2_CSV_DIR = REPO_ROOT / "db assets" / "synthetic_crm_v2" / "csv"
_V1_CSV_DIR = REPO_ROOT / "db assets" / "synthetic_crm_v1" / "csv"
# Prefer the v2 dataset when its directory exists; otherwise fall back to v1.
# Evaluated once at import time.
CSV_DIR = _V2_CSV_DIR if _V2_CSV_DIR.exists() else _V1_CSV_DIR
# UUIDv5 namespace for all deterministic IDs. It is derived from the dataset
# directory name (the parent of the csv folder), so the v1 and v2 datasets
# produce different UUIDs for the same source ID.
NS = uuid.uuid5(uuid.NAMESPACE_URL, f"desineuron.project_velocity.{CSV_DIR.parent.name}")
def read_csv(filename: str, limit: int | None = None) -> list[dict[str, str]]:
    """Load the rows of a CSV file located in ``CSV_DIR``.

    Args:
        filename: File name relative to ``CSV_DIR``.
        limit: Maximum number of data rows to return. Any falsy value
            (``None`` or ``0``) means "no limit".

    Returns:
        One dict per data row keyed by the header names, or an empty list
        (after logging a warning) when the file does not exist.
    """
    from itertools import islice

    path = CSV_DIR / filename
    if not path.exists():
        logger.warning("CSV not found: %s", path)
        return []
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise be glued onto the first header name and break
    # lookups such as row["person_id"].
    with path.open(encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if limit is not None and limit > 0:
            # Stop parsing as soon as enough rows were collected.
            return list(islice(reader, limit))
        rows = list(reader)
    # Falsy limit -> everything; a negative limit keeps plain slice semantics.
    return rows[:limit] if limit else rows
def stable_uuid(domain: str, source_id: str | None) -> str:
    """Return a UUIDv5 string for ``source_id`` scoped to ``domain``.

    The UUID is derived from the module namespace ``NS`` and the text
    ``"<domain>:<source_id>"``, so the same inputs always give the same ID.
    When ``source_id`` is missing or blank, a random UUID4 is used as the
    seed instead, which makes the result different on every call.
    """
    key = source_id.strip() if source_id else ""
    if not key:
        # No usable source ID: seed with a random token (not reproducible).
        key = str(uuid.uuid4())
    derived = uuid.uuid5(NS, f"{domain}:{key}")
    return str(derived)
def safe_json(value: str | None, default: Any) -> Any:
if not value or not value.strip():
return default
try:
return json.loads(value)
except json.JSONDecodeError:
return default
def safe_float(value: str | None, default: float | None = None) -> float | None:
if not value or value.strip() in {"", "null", "None", "nan"}:
return default
try:
return float(value)
except (TypeError, ValueError):
return default
def safe_int(value: str | None, default: int | None = None) -> int | None:
if not value or value.strip() in {"", "null", "None"}:
return default
try:
return int(float(value))
except (TypeError, ValueError):
return default
def normalize_score(value: str | None, default: float = 0.5) -> float:
    """Parse a score cell and squeeze it into the range 0.0 to 1.0.

    Values greater than 1 are treated as percentages and divided by 100
    before clamping. Unparseable cells use ``default``.
    """
    parsed = safe_float(value, default)
    score = default if parsed is None else parsed
    if score > 1:
        # Looks like a 0-100 percentage; rescale to a fraction.
        score /= 100.0
    capped = min(1.0, score)
    return max(0.0, capped)
def normalize_account_type(value: str | None) -> str:
raw = (value or "company").strip().lower()
aliases = {
"individual_business": "company",
"business": "company",
"corporate": "company",
"channel_partner": "broker",
}
normalized = aliases.get(raw, raw)
valid = {"individual", "company", "broker", "developer", "referral_partner", "nri_family"}
return normalized if normalized in valid else "company"
def normalize_relationship_type(value: str | None) -> str:
raw = (value or "family_member").strip().lower()
aliases = {
"partner": "business_partner",
"buyer": "co_buyer",
"cobuyer": "co_buyer",
"family": "family_member",
}
normalized = aliases.get(raw, raw)
valid = {"spouse", "parent", "sibling", "business_partner", "broker_referral", "co_buyer", "family_member", "advisor"}
return normalized if normalized in valid else "family_member"
def safe_dt(value: str | None) -> datetime | None:
if not value or value.strip() in {"", "null", "None"}:
return None
raw = value.strip().replace("Z", "+00:00")
try:
parsed = datetime.fromisoformat(raw)
return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
except ValueError:
return None
def metadata(row: dict[str, str], *, source_id_key: str, used: set[str]) -> str:
    """Build the JSON text stored in a row's ``metadata_json`` column.

    Starts from the row's own ``metadata_json`` cell (when it parses to a
    dict), adds every non-empty column that is not listed in ``used``, and
    finally sets ``synthetic`` to true and ``source_id`` to the original
    CSV identifier found under ``source_id_key``.

    Args:
        row: One CSV row.
        source_id_key: Column that holds the row's source identifier.
        used: Columns already mapped to real table columns.

    Returns:
        The merged metadata serialised with ``json.dumps``.
    """
    payload = safe_json(row.get("metadata_json"), {})
    if not isinstance(payload, dict):
        payload = {}
    for column, cell in row.items():
        if column in used or cell in ("", None):
            continue
        # Leftover CSV columns win over same-named keys in metadata_json.
        payload[column] = cell
    payload["synthetic"] = True
    payload["source_id"] = row.get(source_id_key, "")
    return json.dumps(payload)
async def maybe_execute(conn: Any, dry_run: bool, sql: str, *args: Any) -> None:
    """Run ``sql`` with ``args`` on ``conn`` unless this is a dry run."""
    if dry_run:
        # Dry run: do not send the statement to the database.
        return
    await conn.execute(sql, *args)
async def seed(dry_run: bool = False, limit: int | None = None) -> None:
    """Seed the canonical CRM, inventory, intel and workflow tables from CSV.

    Tables are loaded parents first (projects, units, people, accounts,
    leads, ...) so that later rows can resolve foreign keys through the
    in-memory ``source id -> deterministic UUID`` maps built along the way.
    Rows whose required parent cannot be resolved are skipped. After
    loading, a per-table row count is logged.

    Args:
        dry_run: When true, no INSERT is executed. Read-only checks (schema
            existence, ``to_regclass``) still hit the database.
        limit: Optional row cap applied to ``crm_people.csv`` and
            ``crm_leads.csv`` only; dependent tables shrink implicitly
            because rows without a loaded person/lead are skipped.

    The pool is always closed on exit, including on errors, and only after
    the connection has been released back to it.
    """
    from backend.db.pool import close_pool, create_pool

    pool = await create_pool()
    counts: dict[str, int] = {}
    try:
        async with pool.acquire() as conn:
            exists = await conn.fetchval("SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'crm_people')")
            if not exists:
                logger.error("Canonical schema not found. Run schema_crm_canonical.sql first.")
                # The pool is closed by the finally block once the connection
                # has been released.
                return

            # --- inventory_projects ---------------------------------------
            project_rows = read_csv("inventory_projects.csv")
            project_map = {r["project_id"]: stable_uuid("inventory_projects", r["project_id"]) for r in project_rows}
            project_name_by_source = {r["project_id"]: r.get("project_name", "") for r in project_rows}
            for row in project_rows:
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO inventory_projects (project_id, project_name, developer_name, city, micro_market, location_json, metadata_json)
                    VALUES ($1::uuid, $2, $3, $4, $5, $6::jsonb, $7::jsonb)
                    ON CONFLICT (project_name) DO UPDATE SET
                    project_id = EXCLUDED.project_id,
                    developer_name = EXCLUDED.developer_name,
                    micro_market = EXCLUDED.micro_market,
                    location_json = EXCLUDED.location_json,
                    metadata_json = EXCLUDED.metadata_json
                    """,
                    project_map[row["project_id"]], row.get("project_name"), row.get("developer_name"), row.get("city") or "Kolkata",
                    row.get("micro_market"), json.dumps(safe_json(row.get("location_json"), {})),
                    metadata(row, source_id_key="project_id", used={"project_id", "project_name", "developer_name", "city", "micro_market", "location_json", "metadata_json"}),
                )
            counts["inventory_projects"] = len(project_rows)

            # --- inventory_units ------------------------------------------
            unit_rows = read_csv("inventory_units.csv")
            unit_map = {r["unit_id"]: stable_uuid("inventory_units", r["unit_id"]) for r in unit_rows}
            for row in unit_rows:
                project_id = project_map.get(row.get("project_id", ""))
                if not project_id:
                    # Unit references a project that was not loaded.
                    continue
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO inventory_units (unit_id, project_id, unit_label, configuration, area_sqft, price_current, status, facing, floor, metadata_json)
                    VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8, $9, $10::jsonb)
                    ON CONFLICT (project_id, unit_label) DO UPDATE SET price_current = EXCLUDED.price_current, status = EXCLUDED.status, metadata_json = EXCLUDED.metadata_json
                    """,
                    unit_map[row["unit_id"]], project_id, row.get("unit_label"), row.get("configuration") or "Unknown",
                    safe_float(row.get("area_sqft")), safe_float(row.get("price_current")), row.get("status") or "available",
                    row.get("facing"), safe_int(row.get("floor")),
                    metadata(row, source_id_key="unit_id", used={"unit_id", "project_id", "unit_label", "configuration", "area_sqft", "price_current", "status", "facing", "floor", "metadata_json"}),
                )
            # Counts reflect CSV rows read, including rows skipped above.
            counts["inventory_units"] = len(unit_rows)

            # --- crm_people -----------------------------------------------
            people_rows = read_csv("crm_people.csv", limit=limit)
            person_map = {r["person_id"]: stable_uuid("crm_people", r["person_id"]) for r in people_rows}
            for row in people_rows:
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_people (person_id, full_name, primary_email, primary_phone, linkedin_url, persona_labels, source_confidence, legacy_lead_id, metadata_json, created_at, updated_at)
                    VALUES ($1::uuid, $2, $3, $4, $5, $6::jsonb, $7, $8, $9::jsonb, COALESCE($10, NOW()), COALESCE($11, NOW()))
                    ON CONFLICT (person_id) DO UPDATE SET full_name = EXCLUDED.full_name, primary_email = EXCLUDED.primary_email, primary_phone = EXCLUDED.primary_phone, metadata_json = EXCLUDED.metadata_json
                    """,
                    person_map[row["person_id"]], row.get("full_name"), row.get("primary_email") or None, row.get("primary_phone") or None,
                    row.get("linkedin_url") or None, json.dumps(safe_json(row.get("persona_labels"), [])),
                    safe_float(row.get("source_confidence"), 0.8), row.get("person_id"),
                    metadata(row, source_id_key="person_id", used={"person_id", "full_name", "primary_email", "primary_phone", "linkedin_url", "persona_labels", "source_confidence", "created_at", "updated_at", "metadata_json"}),
                    safe_dt(row.get("created_at")), safe_dt(row.get("updated_at")),
                )
            counts["crm_people"] = len(people_rows)

            # --- crm_accounts ---------------------------------------------
            account_rows = read_csv("crm_accounts.csv")
            account_map = {r["account_id"]: stable_uuid("crm_accounts", r["account_id"]) for r in account_rows}
            for row in account_rows:
                parent_id = account_map.get(row.get("parent_account_id", ""))
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_accounts (account_id, account_name, parent_account_id, account_type, industry, location_ref, metadata_json)
                    VALUES ($1::uuid, $2, NULLIF($3, '')::uuid, $4::crm_account_type, $5, $6, $7::jsonb)
                    ON CONFLICT (account_id) DO UPDATE SET account_name = EXCLUDED.account_name, metadata_json = EXCLUDED.metadata_json
                    """,
                    account_map[row["account_id"]], row.get("account_name"), parent_id or "", normalize_account_type(row.get("account_type")),
                    row.get("industry"), row.get("location_ref"), metadata(row, source_id_key="account_id", used={"account_id", "account_name", "parent_account_id", "account_type", "industry", "location_ref", "metadata_json"}),
                )
            counts["crm_accounts"] = len(account_rows)

            # --- crm_leads ------------------------------------------------
            lead_rows = read_csv("crm_leads.csv", limit=limit)
            # Only leads whose person was loaded get a UUID.
            lead_map = {r["lead_id"]: stable_uuid("crm_leads", r["lead_id"]) for r in lead_rows if r.get("person_id") in person_map}
            valid_status = {"new", "contacted", "qualified", "site_visit_scheduled", "site_visited", "negotiation", "booking_initiated", "booked", "lost", "dormant"}
            for row in lead_rows:
                person_id = person_map.get(row.get("person_id", ""))
                if not person_id:
                    continue
                status = (row.get("status") or "new").lower().strip()
                if status not in valid_status:
                    status = "new"
                # NOTE(review): $11 (created_at) is bound to both created_at
                # and updated_at -- confirm this is intended.
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_leads (lead_id, person_id, account_id, source_system, status, budget_band, urgency, assigned_user_id, legacy_lead_id, metadata_json, created_at, updated_at)
                    VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, $4, $5::crm_lead_status, $6, $7, NULLIF($8, '')::uuid, $9, $10::jsonb, COALESCE($11, NOW()), COALESCE($11, NOW()))
                    ON CONFLICT (lead_id) DO UPDATE SET status = EXCLUDED.status, budget_band = EXCLUDED.budget_band, urgency = EXCLUDED.urgency, metadata_json = EXCLUDED.metadata_json
                    """,
                    lead_map[row["lead_id"]], person_id, account_map.get(row.get("account_id", ""), ""), row.get("source_system") or "csv_upload",
                    status, row.get("budget_band"), row.get("urgency"), "", row.get("lead_id"),
                    metadata(row, source_id_key="lead_id", used={"lead_id", "person_id", "account_id", "source_system", "status", "budget_band", "urgency", "assigned_user_id", "created_at", "metadata_json"}),
                    safe_dt(row.get("created_at")),
                )
            counts["crm_leads"] = len(lead_map)

            # --- crm_households -------------------------------------------
            household_rows = read_csv("crm_households.csv")
            household_map = {r["household_id"]: stable_uuid("crm_households", r["household_id"]) for r in household_rows}
            for row in household_rows:
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_households (household_id, household_name, primary_person_id, metadata_json)
                    VALUES ($1::uuid, $2, NULLIF($3, '')::uuid, $4::jsonb)
                    ON CONFLICT (household_id) DO UPDATE SET household_name = EXCLUDED.household_name, metadata_json = EXCLUDED.metadata_json
                    """,
                    household_map[row["household_id"]], row.get("household_name"), person_map.get(row.get("primary_contact_id", ""), ""),
                    metadata(row, source_id_key="household_id", used={"household_id", "household_name", "primary_contact_id", "metadata_json"}),
                )
            counts["crm_households"] = len(household_rows)

            # --- crm_relationships ----------------------------------------
            relationship_rows = read_csv("crm_relationships.csv")
            for row in relationship_rows:
                a_id = person_map.get(row.get("from_person_id", ""))
                b_id = person_map.get(row.get("to_person_id", ""))
                if not a_id or not b_id:
                    continue
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_relationships (relationship_id, person_a_id, person_b_id, relationship_type, notes)
                    VALUES ($1::uuid, $2::uuid, $3::uuid, $4::crm_relationship_type, $5)
                    ON CONFLICT (person_a_id, person_b_id, relationship_type) DO NOTHING
                    """,
                    stable_uuid("crm_relationships", row.get("relationship_id")), a_id, b_id, normalize_relationship_type(row.get("relationship_type")),
                    json.dumps({"strength_score": row.get("strength_score"), "metadata": safe_json(row.get("metadata_json"), {})}),
                )
            counts["crm_relationships"] = len(relationship_rows)

            # --- crm_opportunities ----------------------------------------
            opportunity_rows = read_csv("crm_opportunities.csv")
            opportunity_map = {r["opportunity_id"]: stable_uuid("crm_opportunities", r["opportunity_id"]) for r in opportunity_rows}
            valid_stage = {"prospect", "qualified", "proposal", "site_visit", "negotiation", "booking", "agreement", "closed_won", "closed_lost"}
            for row in opportunity_rows:
                lead_id = lead_map.get(row.get("lead_id", ""))
                if not lead_id:
                    continue
                stage = row.get("stage") or "prospect"
                if stage not in valid_stage:
                    stage = "prospect"
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_opportunities (opportunity_id, lead_id, project_id, unit_id, stage, value, probability, expected_close_date, next_action, metadata_json)
                    VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, NULLIF($4, '')::uuid, $5::crm_opportunity_stage, $6, $7, $8, $9, $10::jsonb)
                    ON CONFLICT (opportunity_id) DO UPDATE SET stage = EXCLUDED.stage, value = EXCLUDED.value, probability = EXCLUDED.probability, next_action = EXCLUDED.next_action, metadata_json = EXCLUDED.metadata_json
                    """,
                    opportunity_map[row["opportunity_id"]], lead_id, project_map.get(row.get("project_id", ""), ""),
                    unit_map.get(row.get("unit_id", ""), ""), stage, safe_float(row.get("value")), safe_int(row.get("probability")),
                    safe_dt(row.get("expected_close_date")), row.get("next_action"),
                    metadata(row, source_id_key="opportunity_id", used={"opportunity_id", "lead_id", "project_id", "unit_id", "stage", "value", "probability", "expected_close_date", "next_action", "metadata_json"}),
                )
            counts["crm_opportunities"] = len(opportunity_rows)

            # --- crm_property_interests -----------------------------------
            pi_rows = read_csv("crm_property_interests.csv")
            for row in pi_rows:
                person_id = person_map.get(row.get("person_id", ""))
                if not person_id:
                    continue
                project_id = project_map.get(row.get("project_id", ""))
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_property_interests (interest_id, person_id, lead_id, project_id, project_name, unit_preference, configuration, budget_min, budget_max, priority, notes, metadata_json)
                    VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, NULLIF($4, '')::uuid, $5, NULLIF($6, '')::text, $7, $8, $9, $10, $11, $12::jsonb)
                    ON CONFLICT (interest_id) DO UPDATE SET project_name = EXCLUDED.project_name, configuration = EXCLUDED.configuration, budget_min = EXCLUDED.budget_min, budget_max = EXCLUDED.budget_max, notes = EXCLUDED.notes, metadata_json = EXCLUDED.metadata_json
                    """,
                    stable_uuid("crm_property_interests", row.get("interest_id")), person_id, lead_map.get(row.get("lead_id", ""), ""),
                    project_id or "", project_name_by_source.get(row.get("project_id", ""), row.get("project_name") or "Unknown Project"),
                    row.get("unit_id") or "", row.get("configuration_preference") or row.get("configuration"),
                    safe_float(row.get("budget_min")), safe_float(row.get("budget_max")), 1, row.get("notes"),
                    metadata(row, source_id_key="interest_id", used={"interest_id", "person_id", "lead_id", "project_id", "project_name", "unit_id", "configuration_preference", "configuration", "budget_min", "budget_max", "notes", "metadata_json"}),
                )
            counts["crm_property_interests"] = len(pi_rows)

            # --- crm_stage_history ----------------------------------------
            stage_rows = read_csv("crm_stage_history.csv")
            for row in stage_rows:
                lead_id = lead_map.get(row.get("lead_id", ""))
                if not lead_id:
                    continue
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO crm_stage_history (history_id, lead_id, from_status, to_status, changed_by_type, notes, happened_at)
                    VALUES ($1::uuid, $2::uuid, $3, $4, 'system', $5, COALESCE($6, NOW()))
                    ON CONFLICT (history_id) DO UPDATE SET from_status = EXCLUDED.from_status, to_status = EXCLUDED.to_status, notes = EXCLUDED.notes, happened_at = EXCLUDED.happened_at
                    """,
                    stable_uuid("crm_stage_history", row.get("history_id")), lead_id, row.get("from_stage"),
                    row.get("to_stage") or "new", row.get("reason"), safe_dt(row.get("changed_at")),
                )
            counts["crm_stage_history"] = len(stage_rows)

            # --- intel_interactions ---------------------------------------
            interaction_rows = read_csv("intel_interactions.csv")
            interaction_map = {r["interaction_id"]: stable_uuid("intel_interactions", r["interaction_id"]) for r in interaction_rows}
            valid_channels = {"whatsapp", "phone", "email", "site_visit", "office_meeting", "video_call", "cctv", "perception_session", "system"}
            for row in interaction_rows:
                person_id = person_map.get(row.get("person_id", ""))
                if not person_id:
                    continue
                channel = (row.get("channel") or "system").lower()
                if channel not in valid_channels:
                    channel = "system"
                await maybe_execute(
                    conn, dry_run,
                    """
                    INSERT INTO intel_interactions (interaction_id, person_id, lead_id, channel, interaction_type, happened_at, summary, source_ref, metadata_json)
                    VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, $4::intel_channel, $5, COALESCE($6, NOW()), $7, $8, $9::jsonb)
                    ON CONFLICT (interaction_id) DO UPDATE SET summary = EXCLUDED.summary, metadata_json = EXCLUDED.metadata_json
                    """,
                    interaction_map[row["interaction_id"]], person_id, lead_map.get(row.get("lead_id", ""), ""), channel,
                    row.get("interaction_type") or "message", safe_dt(row.get("happened_at")), row.get("summary"), row.get("source_ref"),
                    metadata(row, source_id_key="interaction_id", used={"interaction_id", "person_id", "lead_id", "channel", "interaction_type", "happened_at", "summary", "source_ref", "metadata_json"}),
                )
            counts["intel_interactions"] = len(interaction_rows)

            # --- dependent domains handled by sibling seeders -------------
            counts.update(await self_seed_intel(conn, dry_run, person_map, lead_map, interaction_map, unit_map, project_map, project_name_by_source, opportunity_map))
            counts.update(await self_seed_v2_enrichment(conn, dry_run, person_map, interaction_map))
            counts.update(await self_seed_workflow(conn, dry_run))

            # --- verification log -----------------------------------------
            for table in [
                "crm_people", "crm_leads", "crm_opportunities", "crm_property_interests", "intel_interactions",
                "intel_messages", "intel_calls", "intel_transcripts", "intel_emails", "intel_visits",
                "intel_reminders", "intel_qd_scores", "intel_qd_timeseries", "inventory_projects", "inventory_units",
            ]:
                exists = await conn.fetchval("SELECT to_regclass($1)", f"public.{table}")
                # Table names come from the fixed list above, never from input.
                actual = await conn.fetchval(f"SELECT COUNT(*) FROM {table}") if exists and not dry_run else counts.get(table, 0)
                logger.info("row_count %-28s expected_or_processed=%s actual=%s", table, counts.get(table, 0), actual)
    finally:
        # Runs after the connection has been released by the `async with`
        # above, on success, early return and error alike.
        await close_pool()
async def self_seed_intel(conn: Any, dry_run: bool, person_map: dict[str, str], lead_map: dict[str, str], interaction_map: dict[str, str], unit_map: dict[str, str], project_map: dict[str, str], project_name_by_source: dict[str, str], opportunity_map: dict[str, str]) -> dict[str, int]:
    """Seed the ``intel_*`` tables that hang off people and interactions.

    Each CSV is read exactly once. Rows whose required parent (interaction
    or person) cannot be resolved through the supplied maps are skipped.

    Args:
        conn: Database connection exposing ``execute`` and ``fetchval``.
        dry_run: When true, no INSERT is executed.
        person_map: Source person ID -> seeded UUID.
        lead_map: Source lead ID -> seeded UUID.
        interaction_map: Source interaction ID -> seeded UUID.
        unit_map: Source unit ID -> seeded UUID.
        project_map: Source project ID -> seeded UUID.
        project_name_by_source: Source project ID -> project name.
        opportunity_map: Source opportunity ID -> seeded UUID (currently
            not used by this function).

    Returns:
        Table name -> number of CSV rows read for that table (skipped rows
        included).
    """
    counts: dict[str, int] = {}

    # --- intel_messages ---------------------------------------------------
    message_rows = read_csv("intel_messages.csv")
    for row in message_rows:
        interaction_id = interaction_map.get(row.get("interaction_id", ""))
        if not interaction_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_messages (message_id, interaction_id, sender_role, message_text, delivered_at, metadata_json)
            VALUES ($1::uuid, $2::uuid, $3, $4, COALESCE($5, NOW()), $6::jsonb)
            ON CONFLICT (message_id) DO UPDATE SET message_text = EXCLUDED.message_text, metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_messages", row.get("message_id")),
            interaction_id,
            row.get("sender_role") or "lead",
            row.get("message_text") or "",
            safe_dt(row.get("delivered_at")),
            metadata(row, source_id_key="message_id", used={"message_id", "interaction_id", "sender_role", "message_text", "delivered_at", "metadata_json"}),
        )
    counts["intel_messages"] = len(message_rows)

    # --- intel_calls ------------------------------------------------------
    call_rows = read_csv("intel_calls.csv")
    for row in call_rows:
        interaction_id = interaction_map.get(row.get("interaction_id", ""))
        if not interaction_id:
            continue
        direction = row.get("call_direction") if row.get("call_direction") in {"inbound", "outbound"} else "outbound"
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_calls (call_id, interaction_id, call_direction, duration_seconds, recording_ref, transcript_ref, metadata_json)
            VALUES ($1::uuid, $2::uuid, $3::intel_call_direction, $4, $5, $6, $7::jsonb)
            ON CONFLICT (call_id) DO UPDATE SET duration_seconds = EXCLUDED.duration_seconds, recording_ref = EXCLUDED.recording_ref, transcript_ref = EXCLUDED.transcript_ref
            """,
            stable_uuid("intel_calls", row.get("call_id")),
            interaction_id,
            direction,
            safe_int(row.get("duration_seconds")),
            row.get("recording_ref"),
            row.get("transcript_ref"),
            metadata(row, source_id_key="call_id", used={"call_id", "interaction_id", "call_direction", "duration_seconds", "recording_ref", "transcript_ref", "metadata_json"}),
        )
    counts["intel_calls"] = len(call_rows)
    call_map = {r["call_id"]: stable_uuid("intel_calls", r["call_id"]) for r in call_rows}

    # --- intel_transcripts ------------------------------------------------
    transcript_rows = read_csv("intel_transcripts.csv")
    for row in transcript_rows:
        # A transcript without a known call is still inserted (call_id NULL).
        call_id = call_map.get(row.get("call_id", ""))
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_transcripts (transcript_id, call_id, language, full_text, speaker_segments_json, confidence, metadata_json)
            VALUES ($1::uuid, NULLIF($2, '')::uuid, $3, $4, $5::jsonb, $6, $7::jsonb)
            ON CONFLICT (transcript_id) DO UPDATE SET full_text = EXCLUDED.full_text, speaker_segments_json = EXCLUDED.speaker_segments_json
            """,
            stable_uuid("intel_transcripts", row.get("transcript_id")),
            call_id or "",
            row.get("language") or "en",
            row.get("full_text"),
            json.dumps(safe_json(row.get("speaker_segments_json"), [])),
            safe_float(row.get("confidence")),
            metadata(row, source_id_key="transcript_id", used={"transcript_id", "call_id", "language", "full_text", "speaker_segments_json", "confidence", "metadata_json"}),
        )
    counts["intel_transcripts"] = len(transcript_rows)

    # --- intel_emails -----------------------------------------------------
    email_rows = read_csv("intel_emails.csv")
    for row in email_rows:
        interaction_id = interaction_map.get(row.get("interaction_id", ""))
        if not interaction_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_emails (email_id, interaction_id, from_address, to_addresses, subject, body_text, sent_at, metadata_json)
            VALUES ($1::uuid, $2::uuid, $3, $4::jsonb, $5, $6, COALESCE($7, NOW()), $8::jsonb)
            ON CONFLICT (email_id) DO UPDATE SET subject = EXCLUDED.subject, body_text = EXCLUDED.body_text
            """,
            stable_uuid("intel_emails", row.get("email_id")),
            interaction_id,
            row.get("sender"),
            json.dumps([row.get("recipient")] if row.get("recipient") else []),
            row.get("thread_subject"),
            row.get("body_text"),
            safe_dt(row.get("sent_at")),
            metadata(row, source_id_key="email_id", used={"email_id", "interaction_id", "thread_subject", "sender", "recipient", "body_text", "sent_at", "metadata_json"}),
        )
    counts["intel_emails"] = len(email_rows)

    # --- intel_whatsapp_threads -------------------------------------------
    thread_rows = read_csv("intel_whatsapp_threads.csv")
    for row in thread_rows:
        interaction_id = interaction_map.get(row.get("interaction_id", ""))
        person_id = None
        if interaction_id:
            # The thread CSV is resolved to a person via its interaction row
            # in the database. In a dry run nothing was inserted, so the
            # first known person is used as a stand-in.
            person_id = await conn.fetchval("SELECT person_id::text FROM intel_interactions WHERE interaction_id = $1::uuid", interaction_id) if not dry_run else next(iter(person_map.values()), None)
        if not person_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_whatsapp_threads (thread_id, person_id, lead_id, phone_number, message_count, last_message_at, metadata_json)
            VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, $4, $5, $6, $7::jsonb)
            ON CONFLICT (thread_id) DO UPDATE SET message_count = EXCLUDED.message_count, last_message_at = EXCLUDED.last_message_at
            """,
            stable_uuid("intel_whatsapp_threads", row.get("thread_id")),
            person_id,
            "",
            row.get("phone_number"),
            safe_int(row.get("message_count"), 0),
            safe_dt(row.get("last_message_at")),
            metadata(row, source_id_key="thread_id", used={"thread_id", "interaction_id", "phone_number", "message_count", "last_message_at", "metadata_json"}),
        )
    counts["intel_whatsapp_threads"] = len(thread_rows)

    # --- intel_visits -----------------------------------------------------
    visit_rows = read_csv("intel_visits.csv")
    for row in visit_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_visits (visit_id, person_id, lead_id, project_id, project_name, unit_id, visited_at, visit_notes, metadata_json)
            VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, NULLIF($4, '')::uuid, $5, NULLIF($6, '')::uuid, COALESCE($7, NOW()), $8, $9::jsonb)
            ON CONFLICT (visit_id) DO UPDATE SET visit_notes = EXCLUDED.visit_notes, metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_visits", row.get("visit_id")),
            person_id,
            lead_map.get(row.get("lead_id", ""), ""),
            project_map.get(row.get("project_id", ""), ""),
            project_name_by_source.get(row.get("project_id", "")),
            unit_map.get(row.get("unit_id", ""), ""),
            safe_dt(row.get("visited_at")),
            row.get("visit_notes"),
            metadata(row, source_id_key="visit_id", used={"visit_id", "person_id", "lead_id", "project_id", "unit_id", "visited_at", "visit_notes", "metadata_json"}),
        )
    counts["intel_visits"] = len(visit_rows)

    # --- intel_reminders --------------------------------------------------
    reminder_rows = read_csv("intel_reminders.csv")
    for row in reminder_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_reminders (reminder_id, person_id, lead_id, reminder_type, title, notes, due_at, status, assigned_to, created_by_type, metadata_json)
            VALUES ($1::uuid, $2::uuid, NULLIF($3, '')::uuid, 'follow_up', $4, $5, $6, $7, NULLIF($8, '')::uuid, 'system', $9::jsonb)
            ON CONFLICT (reminder_id) DO UPDATE SET title = EXCLUDED.title, due_at = EXCLUDED.due_at, status = EXCLUDED.status
            """,
            stable_uuid("intel_reminders", row.get("reminder_id")),
            person_id,
            lead_map.get(row.get("lead_id", ""), ""),
            row.get("reminder_text") or "Follow up",
            row.get("reminder_text"),
            safe_dt(row.get("due_at")),
            row.get("status") or "pending",
            "",
            metadata(row, source_id_key="reminder_id", used={"reminder_id", "person_id", "lead_id", "reminder_text", "due_at", "status", "assigned_to", "metadata_json"}),
        )
    counts["intel_reminders"] = len(reminder_rows)

    # --- intel_qd_scores --------------------------------------------------
    qd_score_rows = read_csv("intel_qd_scores.csv")
    for row in qd_score_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_qd_scores (qd_id, person_id, score_type, current_value, computed_at, evidence_refs_json, metadata_json)
            VALUES ($1::uuid, $2::uuid, $3, $4, COALESCE($5, NOW()), $6::jsonb, $7::jsonb)
            ON CONFLICT (person_id, score_type) DO UPDATE SET current_value = EXCLUDED.current_value, computed_at = EXCLUDED.computed_at
            """,
            stable_uuid("intel_qd_scores", row.get("qd_id")),
            person_id,
            row.get("score_type") or "intent_score",
            normalize_score(row.get("current_value")),
            safe_dt(row.get("computed_at")),
            json.dumps(safe_json(row.get("evidence_refs_json"), [])),
            metadata(row, source_id_key="qd_id", used={"qd_id", "person_id", "score_type", "current_value", "computed_at", "evidence_refs_json", "metadata_json"}),
        )
    counts["intel_qd_scores"] = len(qd_score_rows)

    # --- intel_qd_timeseries ----------------------------------------------
    qd_series_rows = read_csv("intel_qd_timeseries.csv")
    for row in qd_series_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            continue
        # NOTE(review): $3 (signal_source) is bound to both score_type and
        # signal_source, and is never NULL here, so the 'intent_score'
        # fallback cannot trigger -- confirm this is intended.
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_qd_timeseries (timeseries_id, person_id, score_type, signal_source, timestamp, value, evidence_ref, metadata_json)
            VALUES ($1::uuid, $2::uuid, COALESCE($3, 'intent_score'), $3, COALESCE($4, NOW()), $5, $6, $7::jsonb)
            ON CONFLICT (timeseries_id) DO UPDATE SET value = EXCLUDED.value, metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_qd_timeseries", row.get("timeseries_id")),
            person_id,
            row.get("signal_source") or "synthetic_signal",
            safe_dt(row.get("timestamp")),
            normalize_score(row.get("value")),
            row.get("evidence_ref"),
            metadata(row, source_id_key="timeseries_id", used={"timeseries_id", "person_id", "signal_source", "timestamp", "value", "evidence_ref", "metadata_json"}),
        )
    counts["intel_qd_timeseries"] = len(qd_series_rows)

    # Evidence placeholders are loaded as metadata-rich rows.
    # --- intel_vehicle_events ---------------------------------------------
    vehicle_rows = read_csv("intel_vehicle_events.csv")
    for row in vehicle_rows:
        person_id = person_map.get(row.get("person_id", ""))
        # metadata_json may legally parse to a list or scalar; only a dict
        # can carry a camera_id.
        event_meta = safe_json(row.get("metadata_json"), {})
        camera_id = event_meta.get("camera_id") if isinstance(event_meta, dict) else None
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_vehicle_events (event_id, person_id, zone, license_plate_hash, vehicle_class, wealth_indicator, cctv_ref, captured_at, metadata_json)
            VALUES ($1::uuid, NULLIF($2, '')::uuid, $3, $4, $5, $6, $7, COALESCE($8, NOW()), $9::jsonb)
            ON CONFLICT (event_id) DO UPDATE SET metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_vehicle_events", row.get("event_id")),
            person_id or "",
            row.get("location_ref"),
            row.get("vehicle_number"),
            row.get("event_type") or "unknown",
            "unknown",
            camera_id,
            safe_dt(row.get("detected_at")),
            metadata(row, source_id_key="event_id", used={"event_id", "person_id", "vehicle_number", "event_type", "detected_at", "location_ref", "confidence", "metadata_json"}),
        )
    counts["intel_vehicle_events"] = len(vehicle_rows)

    # --- intel_perception_events ------------------------------------------
    perception_rows = read_csv("intel_perception_events.csv")
    for row in perception_rows:
        person_id = person_map.get(row.get("person_id", ""))
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_perception_events (perception_id, person_id, session_ref, event_type, engagement_score, happened_at, metadata_json)
            VALUES ($1::uuid, NULLIF($2, '')::uuid, $3, $4, $5, COALESCE($6, NOW()), $7::jsonb)
            ON CONFLICT (perception_id) DO UPDATE SET metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_perception_events", row.get("event_id")),
            person_id or "",
            row.get("session_id"),
            row.get("event_type") or "perception",
            normalize_score(row.get("value")),
            safe_dt(row.get("detected_at")),
            metadata(row, source_id_key="event_id", used={"event_id", "person_id", "session_id", "event_type", "detected_at", "value", "metadata_json"}),
        )
    counts["intel_perception_events"] = len(perception_rows)

    # --- intel_cctv_links -------------------------------------------------
    # Built from every visit row in the CSV, including visits skipped above.
    visit_map = {r["visit_id"]: stable_uuid("intel_visits", r["visit_id"]) for r in visit_rows}
    cctv_rows = read_csv("intel_cctv_links.csv")
    for row in cctv_rows:
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_cctv_links (link_id, visit_id, clip_ref, camera_zone, linked_at, metadata_json)
            VALUES ($1::uuid, NULLIF($2, '')::uuid, $3, $4, COALESCE($5, NOW()), $6::jsonb)
            ON CONFLICT (link_id) DO UPDATE SET metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_cctv_links", row.get("link_id")),
            visit_map.get(row.get("visit_id", ""), ""),
            row.get("clip_ref"),
            row.get("camera_id"),
            safe_dt(row.get("recorded_at")),
            metadata(row, source_id_key="link_id", used={"link_id", "visit_id", "camera_id", "clip_ref", "recorded_at", "metadata_json"}),
        )
    counts["intel_cctv_links"] = len(cctv_rows)
    return counts
async def self_seed_workflow(conn: Any, dry_run: bool) -> dict[str, int]:
    """Seed the workflow tables: actions, approvals and writebacks.

    Each CSV is read exactly once. Approvals whose action is unknown are
    skipped; writebacks without a known action are inserted with a NULL
    ``action_id``. Statuses outside the accepted set become ``"pending"``.

    Args:
        conn: Database connection exposing ``execute``.
        dry_run: When true, no INSERT is executed.

    Returns:
        Table name -> number of CSV rows read for that table (skipped rows
        included).
    """
    counts: dict[str, int] = {}
    valid_status = {"pending", "review_required", "approved", "rejected", "executed", "failed", "cancelled"}

    # --- workflow_actions -------------------------------------------------
    action_rows = read_csv("workflow_actions.csv")
    action_map = {r["action_id"]: stable_uuid("workflow_actions", r["action_id"]) for r in action_rows}
    for row in action_rows:
        status = row.get("status") if row.get("status") in valid_status else "pending"
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO workflow_actions (action_id, action_type, target_domain, target_entity_ref, status, created_by_agent, proposal_payload, created_at)
            VALUES ($1::uuid, $2, $3, $4, $5::wf_status, $6, $7::jsonb, COALESCE($8, NOW()))
            ON CONFLICT (action_id) DO UPDATE SET status = EXCLUDED.status, proposal_payload = EXCLUDED.proposal_payload
            """,
            action_map[row["action_id"]],
            row.get("action_type") or "enrichment",
            row.get("target_domain") or "crm",
            row.get("target_entity_ref"),
            status,
            row.get("created_by"),
            metadata(row, source_id_key="action_id", used={"action_id", "action_type", "target_domain", "target_entity_ref", "status", "created_by", "created_at", "metadata_json"}),
            safe_dt(row.get("created_at")),
        )
    counts["workflow_actions"] = len(action_rows)

    # --- workflow_approvals -----------------------------------------------
    approval_rows = read_csv("workflow_approvals.csv")
    approval_map = {r["approval_id"]: stable_uuid("workflow_approvals", r["approval_id"]) for r in approval_rows}
    for row in approval_rows:
        action_id = action_map.get(row.get("action_id", ""))
        if not action_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO workflow_approvals (decision_id, action_id, decision, decision_notes, decided_at)
            VALUES ($1::uuid, $2::uuid, $3, $4, COALESCE($5, NOW()))
            ON CONFLICT (decision_id) DO UPDATE SET decision = EXCLUDED.decision, decision_notes = EXCLUDED.decision_notes
            """,
            approval_map[row["approval_id"]],
            action_id,
            row.get("decision") or "approved",
            row.get("decision_notes"),
            safe_dt(row.get("decided_at")),
        )
    counts["workflow_approvals"] = len(approval_rows)

    # --- workflow_writebacks ----------------------------------------------
    writeback_rows = read_csv("workflow_writebacks.csv")
    for row in writeback_rows:
        # Writebacks point at their action through the proposal_ref column.
        action_id = action_map.get(row.get("proposal_ref", ""))
        status = row.get("status") if row.get("status") in valid_status else "pending"
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO workflow_writebacks (writeback_id, action_id, target_domain, target_entity_ref, status, executed_at, change_payload)
            VALUES ($1::uuid, NULLIF($2, '')::uuid, $3, $4, $5::wf_status, $6, $7::jsonb)
            ON CONFLICT (writeback_id) DO UPDATE SET status = EXCLUDED.status, change_payload = EXCLUDED.change_payload
            """,
            stable_uuid("workflow_writebacks", row.get("writeback_id")),
            action_id or "",
            row.get("target_domain") or "crm",
            row.get("target_entity_ref") or "",
            status,
            safe_dt(row.get("executed_at")),
            json.dumps({"change_summary": row.get("change_summary"), "source": row}),
        )
    counts["workflow_writebacks"] = len(writeback_rows)
    return counts
async def self_seed_v2_enrichment(
    conn: Any,
    dry_run: bool,
    person_map: dict[str, str],
    interaction_map: dict[str, str],
) -> dict[str, int]:
    """Upsert the v2 enrichment CSVs into the intel_* and read_* tables.

    Five CSVs are processed in order: ``intel_email_threads``,
    ``intel_extracted_facts``, ``intel_call_objections``,
    ``read_last_contacted`` and ``read_next_best_action``. Every statement is
    an ``INSERT ... ON CONFLICT DO UPDATE`` issued through ``maybe_execute``.

    Args:
        conn: Handle forwarded untouched to ``maybe_execute`` (presumably the
            database connection -- confirm against ``maybe_execute``).
        dry_run: Forwarded to ``maybe_execute``; the CLI describes it as
            "parse and validate without writing to DB".
        person_map: Source CSV ``person_id`` -> canonical person UUID string.
            Rows whose ``person_id`` is not in this map are skipped for the
            person-keyed tables.
        interaction_map: Source CSV ``interaction_id`` -> canonical
            interaction UUID string. An unmapped interaction is stored as
            NULL (empty string + ``NULLIF`` in the SQL).

    Returns:
        Table name -> number of rows present in the corresponding CSV. Rows
        skipped because of an unmapped ``person_id`` are still counted, which
        matches how the sibling seeders in this file report their counts.
    """
    counts: dict[str, int] = {}

    # ------------------------------------------------------------------
    # intel_email_threads
    # ------------------------------------------------------------------
    # Each CSV is read exactly once; the same list drives both the upserts
    # and the reported count (previously every file was parsed twice).
    thread_rows = read_csv("intel_email_threads.csv")
    for row in thread_rows:
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_email_threads (
                thread_id, subject, first_email_at, last_email_at, email_count,
                participants, status, broker_id, metadata_json
            ) VALUES ($1::uuid, $2, $3, $4, $5, $6::jsonb, $7, $8, $9::jsonb)
            ON CONFLICT (thread_id) DO UPDATE SET subject = EXCLUDED.subject,
                last_email_at = EXCLUDED.last_email_at, email_count = EXCLUDED.email_count,
                participants = EXCLUDED.participants, status = EXCLUDED.status, broker_id = EXCLUDED.broker_id,
                metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_email_threads", row.get("thread_id")),
            row.get("subject"),
            safe_dt(row.get("first_email_at")),
            safe_dt(row.get("last_email_at")),
            safe_int(row.get("email_count"), 0),
            # Round-trip through safe_json so malformed input becomes "[]".
            json.dumps(safe_json(row.get("participants"), [])),
            row.get("status"),
            row.get("broker_id"),
            metadata(row, source_id_key="thread_id", used={"thread_id", "subject", "first_email_at", "last_email_at", "email_count", "participants", "status", "broker_id", "metadata_json"}),
        )
    counts["intel_email_threads"] = len(thread_rows)

    # ------------------------------------------------------------------
    # intel_extracted_facts (requires a known person)
    # ------------------------------------------------------------------
    fact_rows = read_csv("intel_extracted_facts.csv")
    for row in fact_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            # The person was not seeded (e.g. filtered out upstream); skip.
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_extracted_facts (
                fact_id, interaction_id, person_id, fact_type, fact_value, confidence,
                extracted_from, source_context, extracted_at, metadata_json
            ) VALUES ($1::uuid, NULLIF($2, '')::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, $10::jsonb)
            ON CONFLICT (fact_id) DO UPDATE SET fact_type = EXCLUDED.fact_type,
                fact_value = EXCLUDED.fact_value, confidence = EXCLUDED.confidence,
                source_context = EXCLUDED.source_context, metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_extracted_facts", row.get("fact_id")),
            # "" is turned into NULL by NULLIF in the statement above.
            interaction_map.get(row.get("interaction_id", ""), ""),
            person_id,
            row.get("fact_type") or "note",
            row.get("fact_value"),
            safe_float(row.get("confidence")),
            row.get("extracted_from"),
            row.get("source_context"),
            safe_dt(row.get("extracted_at")),
            metadata(row, source_id_key="fact_id", used={"fact_id", "interaction_id", "person_id", "fact_type", "fact_value", "confidence", "extracted_from", "source_context", "extracted_at", "metadata_json"}),
        )
    counts["intel_extracted_facts"] = len(fact_rows)

    # ------------------------------------------------------------------
    # intel_call_objections
    # ------------------------------------------------------------------
    # NOTE(review): stable_uuid falls back to a random UUID when the source id
    # is blank, so a row without objection_id/call_id is not idempotent and its
    # call_id will not match any seeded call -- confirm the CSV never has blanks.
    objection_rows = read_csv("intel_call_objections.csv")
    for row in objection_rows:
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO intel_call_objections (
                objection_id, call_id, objection_type, category, severity, status,
                client_quote, agent_response, resolution_strategy, extracted_at,
                confidence_score, metadata_json
            ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb)
            ON CONFLICT (objection_id) DO UPDATE SET objection_type = EXCLUDED.objection_type,
                category = EXCLUDED.category, severity = EXCLUDED.severity, status = EXCLUDED.status,
                client_quote = EXCLUDED.client_quote, agent_response = EXCLUDED.agent_response,
                resolution_strategy = EXCLUDED.resolution_strategy,
                confidence_score = EXCLUDED.confidence_score,
                metadata_json = EXCLUDED.metadata_json
            """,
            stable_uuid("intel_call_objections", row.get("objection_id")),
            stable_uuid("intel_calls", row.get("call_id")),
            row.get("objection_type"),
            row.get("category"),
            row.get("severity"),
            row.get("status"),
            row.get("client_quote"),
            row.get("agent_response"),
            row.get("resolution_strategy"),
            safe_dt(row.get("extracted_at")),
            safe_float(row.get("confidence_score")),
            metadata(row, source_id_key="objection_id", used={"objection_id", "call_id", "objection_type", "category", "severity", "status", "client_quote", "agent_response", "resolution_strategy", "extracted_at", "confidence_score", "metadata_json"}),
        )
    counts["intel_call_objections"] = len(objection_rows)

    # ------------------------------------------------------------------
    # read_last_contacted (one row per person)
    # ------------------------------------------------------------------
    last_contacted_rows = read_csv("read_last_contacted.csv")
    for row in last_contacted_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO read_last_contacted (
                person_id, last_contact_at, last_channel, last_interaction_type,
                days_since_contact, interactions_last_7d, interactions_last_30d,
                interactions_last_90d, total_interactions, current_stage, broker_id,
                broker_name, computed_at, metadata_json
            ) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14::jsonb)
            ON CONFLICT (person_id) DO UPDATE SET last_contact_at = EXCLUDED.last_contact_at,
                last_channel = EXCLUDED.last_channel, last_interaction_type = EXCLUDED.last_interaction_type,
                days_since_contact = EXCLUDED.days_since_contact,
                interactions_last_7d = EXCLUDED.interactions_last_7d,
                interactions_last_30d = EXCLUDED.interactions_last_30d,
                interactions_last_90d = EXCLUDED.interactions_last_90d,
                total_interactions = EXCLUDED.total_interactions,
                current_stage = EXCLUDED.current_stage, broker_id = EXCLUDED.broker_id,
                broker_name = EXCLUDED.broker_name, computed_at = EXCLUDED.computed_at,
                metadata_json = EXCLUDED.metadata_json
            """,
            person_id,
            safe_dt(row.get("last_contact_at")),
            row.get("last_channel"),
            row.get("last_interaction_type"),
            safe_int(row.get("days_since_contact")),
            safe_int(row.get("interactions_last_7d")),
            safe_int(row.get("interactions_last_30d")),
            safe_int(row.get("interactions_last_90d")),
            safe_int(row.get("total_interactions")),
            row.get("current_stage"),
            row.get("broker_id"),
            row.get("broker_name"),
            safe_dt(row.get("computed_at")),
            # Every CSV column is declared "used" for this table.
            metadata(row, source_id_key="person_id", used=set(row.keys())),
        )
    counts["read_last_contacted"] = len(last_contacted_rows)

    # ------------------------------------------------------------------
    # read_next_best_action (one row per person)
    # ------------------------------------------------------------------
    next_action_rows = read_csv("read_next_best_action.csv")
    for row in next_action_rows:
        person_id = person_map.get(row.get("person_id", ""))
        if not person_id:
            continue
        await maybe_execute(
            conn, dry_run,
            """
            INSERT INTO read_next_best_action (
                person_id, recommended_action, priority, rationale, suggested_channel,
                due_within_days, broker_id, broker_name, opportunity_context,
                computed_at, metadata_json
            ) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11::jsonb)
            ON CONFLICT (person_id) DO UPDATE SET recommended_action = EXCLUDED.recommended_action,
                priority = EXCLUDED.priority, rationale = EXCLUDED.rationale,
                suggested_channel = EXCLUDED.suggested_channel, due_within_days = EXCLUDED.due_within_days,
                broker_id = EXCLUDED.broker_id, broker_name = EXCLUDED.broker_name,
                opportunity_context = EXCLUDED.opportunity_context, computed_at = EXCLUDED.computed_at,
                metadata_json = EXCLUDED.metadata_json
            """,
            person_id,
            row.get("recommended_action"),
            row.get("priority"),
            row.get("rationale"),
            row.get("suggested_channel"),
            safe_int(row.get("due_within_days")),
            row.get("broker_id"),
            row.get("broker_name"),
            row.get("opportunity_context"),
            safe_dt(row.get("computed_at")),
            metadata(row, source_id_key="person_id", used=set(row.keys())),
        )
    counts["read_next_best_action"] = len(next_action_rows)

    return counts
def main() -> None:
    """Parse the command-line flags and run ``seed`` to completion via asyncio."""
    cli = argparse.ArgumentParser(description="Seed canonical CRM tables from synthetic data CSVs")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and validate without writing to DB",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit primary people/leads for testing",
    )
    options = cli.parse_args()

    # The coroutine is built from the parsed flags and driven by asyncio.run.
    seed_coro = seed(dry_run=options.dry_run, limit=options.limit)
    asyncio.run(seed_coro)
# Run the seeder only when this file is executed as a script, so importing the
# module (e.g. to reuse its helpers) does not trigger the CLI.
if __name__ == "__main__":
    main()