#!/usr/bin/env python3
"""
backend/scripts/seed_synthetic_crm.py

Seed the canonical CRM tables from the synthetic dataset CSVs.

Usage:
    python -m backend.scripts.seed_synthetic_crm [--dry-run] [--limit N]

Reads from:  db assets/synthetic_crm_v1/csv/
Writes to:   canonical crm_*, intel_*, inventory_* tables

This script implements the import → canonical commit flow without going
through the HTTP import review UI — for initial database seeding only.
"""
from __future__ import annotations

import argparse
import asyncio
import csv
import json
import logging
import math
import os
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger("velocity.seed")

# ── Data directory ────────────────────────────────────────────────────────────
REPO_ROOT = Path(__file__).parent.parent.parent
CSV_DIR = REPO_ROOT / "db assets" / "synthetic_crm_v1" / "csv"

# ── Parsing constants ─────────────────────────────────────────────────────────
# Cell contents that the safe_* parsers treat as "no value".
_NULL_TOKENS = frozenset({"", "null", "None", "nan"})

# strptime layouts tried first by safe_dt; values matching them are taken as UTC.
_DT_FORMATS = (
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d",
    "%Y-%m-%dT%H:%M:%S.%f",
)

# Values accepted for the `$n::crm_lead_status` cast; anything else becomes "new".
VALID_STATUSES = frozenset({
    'new', 'contacted', 'qualified', 'site_visit_scheduled', 'site_visited',
    'negotiation', 'booking_initiated', 'booked', 'lost', 'dormant',
})

# Values accepted for the `$n::intel_channel` cast; anything else becomes "system".
VALID_CHANNELS = frozenset({
    'whatsapp', 'phone', 'email', 'site_visit', 'office_meeting',
    'video_call', 'cctv', 'perception_session', 'system',
})


# ── CSV / value helpers ───────────────────────────────────────────────────────
def read_csv(filename: str) -> list[dict]:
    """Load ``CSV_DIR / filename`` as a list of row dicts.

    Returns an empty list (and logs a warning) when the file does not exist.
    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark does
    not become part of the first column name.
    """
    path = CSV_DIR / filename
    if not path.exists():
        logger.warning("CSV not found: %s", path)
        return []
    with path.open(encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f))


def _is_blank(val: object) -> bool:
    """Return True when *val* is None or a string holding only a null token."""
    if val is None:
        return True
    if isinstance(val, str):
        return val.strip() in _NULL_TOKENS
    return False


def _cell(row: dict, key: str, default: str = "") -> str:
    """Return ``row[key]`` stripped, or *default* when it is missing or blank.

    ``csv.DictReader`` fills short rows with ``None``, so a plain
    ``row.get(key, "")`` can still return ``None``; this helper never does.
    """
    value = row.get(key)
    if value is None:
        return default
    return str(value).strip() or default


def safe_float(val: str | None, default: float | None = None) -> float | None:
    """Parse *val* as a finite float.

    Returns *default* for blank/null tokens, unparseable text, and non-finite
    results (NaN, ±inf), which would otherwise slip through range clamps.
    """
    if _is_blank(val):
        return default
    try:
        parsed = float(val)
    except (ValueError, TypeError):
        return default
    return parsed if math.isfinite(parsed) else default


def safe_int(val: str | None, default: int | None = None) -> int | None:
    """Parse *val* as an int, truncating decimal text such as ``"3.9"`` to 3.

    Returns *default* for blank/null tokens and for anything that cannot be
    converted, including infinities.
    """
    if _is_blank(val):
        return default
    try:
        # Direct conversion keeps full precision for large integers.
        return int(val)
    except (ValueError, TypeError):
        pass
    try:
        return int(float(val))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")).
        return default


def safe_dt(val: str | None) -> datetime | None:
    """Parse *val* into a timezone-aware UTC datetime, or None if unparseable.

    The fixed layouts in ``_DT_FORMATS`` are tried first and interpreted as
    UTC. Otherwise the text is handed to ``datetime.fromisoformat`` (with a
    trailing ``Z`` rewritten to ``+00:00``); naive results are taken as UTC and
    aware results are converted to UTC.
    """
    if _is_blank(val):
        return None
    text = str(val).strip()
    for fmt in _DT_FORMATS:
        try:
            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue

    # fromisoformat only understands a literal "Z" suffix on newer Pythons.
    iso_text = text[:-1] + "+00:00" if text.endswith(("Z", "z")) else text
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def _parse_persona_labels(raw: str) -> list:
    """Decode a JSON-array cell; anything else (or invalid JSON) yields []."""
    if not raw.startswith("["):
        return []
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return []


# ── Phase helpers (one per target table) ──────────────────────────────────────
async def _seed_projects(conn: Any, rows: list[dict]) -> dict[str, str]:
    """Insert missing inventory_projects rows; return project_name → project_id."""
    project_name_to_id: dict[str, str] = {}
    for row in rows:
        pname = _cell(row, "project_name")
        if not pname:
            continue
        pid = await conn.fetchval(
            "SELECT project_id FROM inventory_projects WHERE project_name = $1",
            pname,
        )
        if pid:
            project_name_to_id[pname] = str(pid)
            continue
        pid = str(uuid.uuid4())
        await conn.execute(
            """
            INSERT INTO inventory_projects
                (project_id, project_name, developer_name, city, micro_market,
                 created_at, updated_at)
            VALUES ($1::uuid, $2, $3, $4, $5, NOW(), NOW())
            ON CONFLICT (project_name) DO NOTHING
            """,
            pid,
            pname,
            _cell(row, "developer_name"),
            _cell(row, "city", "Kolkata"),
            _cell(row, "micro_market"),
        )
        project_name_to_id[pname] = pid
    return project_name_to_id


async def _seed_people(conn: Any, rows: list[dict]) -> tuple[dict[str, str], int]:
    """Insert crm_people rows.

    Returns ``(person_id_map, inserted)`` where the map goes from the CSV
    person_id to the new DB person_id. Only rows that were actually written
    are mapped, so later phases never reference a UUID that does not exist.
    """
    person_id_map: dict[str, str] = {}
    inserted = 0
    skipped = 0
    for row in rows:
        src_id = _cell(row, "person_id")
        full_name = _cell(row, "full_name")
        if not full_name:
            continue
        new_id = str(uuid.uuid4())
        persona_labels = _parse_persona_labels(_cell(row, "persona_labels"))
        # RETURNING yields no row when ON CONFLICT skips the insert.
        written_id = await conn.fetchval(
            """
            INSERT INTO crm_people (
                person_id, full_name, primary_email, primary_phone,
                buyer_type, persona_labels, source_confidence,
                legacy_lead_id, metadata_json, created_at, updated_at
            ) VALUES (
                $1::uuid, $2, $3, $4, $5, $6::jsonb, $7, $8, $9::jsonb,
                NOW(), NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING person_id
            """,
            new_id,
            full_name,
            row.get("primary_email") or None,
            row.get("primary_phone") or None,
            row.get("buyer_type") or None,
            json.dumps(persona_labels),
            safe_float(row.get("source_confidence"), 0.8),
            src_id or None,
            json.dumps({"synthetic": True, "source_id": src_id}),
        )
        if written_id is None:
            skipped += 1
            continue
        inserted += 1
        # An empty source id must not be mapped: it would capture every child
        # row whose person_id cell is blank.
        if src_id:
            person_id_map[src_id] = new_id
    if skipped:
        logger.warning(" → %d people skipped (already present)", skipped)
    return person_id_map, inserted


async def _seed_leads(
    conn: Any, rows: list[dict], person_id_map: dict[str, str]
) -> tuple[dict[str, str], int]:
    """Insert crm_leads rows for known people.

    Returns ``(lead_id_map, inserted)``; the map goes from the CSV lead_id to
    the new DB lead_id and contains only rows that were actually written.
    """
    lead_id_map: dict[str, str] = {}
    inserted = 0
    skipped = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        src_lead_id = _cell(row, "lead_id")
        raw_status = _cell(row, "status", "new").lower()
        status = raw_status if raw_status in VALID_STATUSES else "new"
        new_lead_id = str(uuid.uuid4())
        written_id = await conn.fetchval(
            """
            INSERT INTO crm_leads (
                lead_id, person_id, source_system, status,
                budget_band, urgency, financing_posture, timeline_to_decision,
                legacy_lead_id, metadata_json, created_at, updated_at
            ) VALUES (
                $1::uuid, $2::uuid, $3, $4::crm_lead_status,
                $5, $6, $7, $8, $9, $10::jsonb, NOW(), NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING lead_id
            """,
            new_lead_id,
            db_person_id,
            _cell(row, "source_system", "csv_upload"),
            status,
            row.get("budget_band") or None,
            row.get("urgency") or None,
            row.get("financing_posture") or None,
            row.get("timeline_to_decision") or None,
            src_lead_id or None,
            json.dumps({"synthetic": True, "source_lead_id": src_lead_id}),
        )
        if written_id is None:
            skipped += 1
            continue
        inserted += 1
        if src_lead_id:
            lead_id_map[src_lead_id] = new_lead_id
    if skipped:
        logger.warning(" → %d leads skipped (already present)", skipped)
    return lead_id_map, inserted


async def _seed_property_interests(
    conn: Any,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert crm_property_interests rows; return the number of rows submitted."""
    seeded = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        project_name = _cell(row, "project_name")
        if not project_name:
            continue
        await conn.execute(
            """
            INSERT INTO crm_property_interests (
                interest_id, person_id, lead_id, project_name,
                unit_preference, configuration, budget_min, budget_max,
                priority, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_person_id,
            lead_id_map.get(_cell(row, "lead_id")),
            project_name,
            row.get("unit_preference") or None,
            row.get("configuration") or None,
            safe_float(row.get("budget_min")),
            safe_float(row.get("budget_max")),
            safe_int(row.get("priority"), 1),
        )
        seeded += 1
    return seeded


async def _seed_interactions(
    conn: Any,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert intel_interactions rows; return the number of rows submitted."""
    seeded = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        raw_channel = _cell(row, "channel", "system").lower()
        channel = raw_channel if raw_channel in VALID_CHANNELS else "system"
        # Unparseable or missing timestamps fall back to the current time.
        happened_at = safe_dt(row.get("happened_at")) or datetime.now(timezone.utc)
        await conn.execute(
            """
            INSERT INTO intel_interactions (
                interaction_id, person_id, lead_id, channel,
                interaction_type, happened_at, summary, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4::intel_channel,
                $5, $6, $7, NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_person_id,
            lead_id_map.get(_cell(row, "lead_id")),
            channel,
            _cell(row, "interaction_type", "message"),
            happened_at,
            row.get("summary") or None,
        )
        seeded += 1
    return seeded


async def _seed_qd_scores(
    conn: Any, rows: list[dict], person_id_map: dict[str, str]
) -> int:
    """Upsert intel_qd_scores rows; return the number of rows submitted."""
    seeded = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        # safe_float never returns None here because a default is supplied.
        raw_value = safe_float(row.get("current_value"), 0.5)
        current_value = max(0.0, min(1.0, raw_value))
        await conn.execute(
            """
            INSERT INTO intel_qd_scores (
                qd_id, person_id, score_type, current_value,
                reasoning, computed_at
            ) VALUES (
                $1::uuid, $2::uuid, $3, $4, $5, NOW()
            )
            ON CONFLICT (person_id, score_type) DO UPDATE
                SET current_value = EXCLUDED.current_value,
                    computed_at = NOW()
            """,
            str(uuid.uuid4()),
            db_person_id,
            _cell(row, "score_type", "intent_score"),
            current_value,
            row.get("reasoning") or None,
        )
        seeded += 1
    return seeded


async def _seed_reminders(
    conn: Any,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert intel_reminders rows; return the number of rows submitted."""
    seeded = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        title = _cell(row, "title")
        if not title:
            continue
        await conn.execute(
            """
            INSERT INTO intel_reminders (
                reminder_id, person_id, lead_id, reminder_type,
                title, notes, due_at, status, priority,
                created_by_type, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9,
                'system', NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_person_id,
            lead_id_map.get(_cell(row, "lead_id")),
            _cell(row, "reminder_type", "follow_up"),
            title,
            row.get("notes") or None,
            safe_dt(row.get("due_at")),
            _cell(row, "status", "pending"),
            _cell(row, "priority", "normal"),
        )
        seeded += 1
    return seeded


async def _seed_stage_history(
    conn: Any, rows: list[dict], lead_id_map: dict[str, str]
) -> int:
    """Insert crm_stage_history rows; return the number of rows submitted."""
    seeded = 0
    for row in rows:
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))
        if not db_lead_id:
            continue
        await conn.execute(
            """
            INSERT INTO crm_stage_history (
                history_id, lead_id, from_status, to_status,
                notes, happened_at
            ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6)
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_lead_id,
            row.get("from_status") or None,
            _cell(row, "to_status", "new"),
            row.get("notes") or None,
            safe_dt(row.get("happened_at")) or datetime.now(timezone.utc),
        )
        seeded += 1
    return seeded


async def _run_phases(conn: Any, dry_run: bool, limit: int | None) -> None:
    """Read each CSV and, unless *dry_run*, write it through *conn* in order.

    Phases run in dependency order: people before leads, and both before the
    tables that reference them. In dry-run mode the CSVs are still read but
    nothing is written, so every reported count is zero.
    """
    # ── Phase 1: Inventory Projects ──────────────────────────────────────────
    logger.info("[1/9] Seeding inventory_projects…")
    projects_rows = read_csv("inventory_projects.csv")
    project_name_to_id: dict[str, str] = {}
    if not dry_run:
        project_name_to_id = await _seed_projects(conn, projects_rows)
    logger.info(" → %d projects mapped", len(project_name_to_id))

    # ── Phase 2: crm_people ──────────────────────────────────────────────────
    logger.info("[2/9] Seeding crm_people…")
    people_rows = read_csv("crm_people.csv")
    if limit:  # None and 0 both mean "no limit"
        people_rows = people_rows[:limit]
    person_id_map: dict[str, str] = {}  # original CSV person_id → DB person_id
    seeded_people = 0
    if not dry_run:
        person_id_map, seeded_people = await _seed_people(conn, people_rows)
    logger.info(" → %d people seeded", seeded_people)

    # ── Phase 3: crm_leads ───────────────────────────────────────────────────
    logger.info("[3/9] Seeding crm_leads…")
    leads_rows = read_csv("crm_leads.csv")
    lead_id_map: dict[str, str] = {}  # original CSV lead_id → DB lead_id
    seeded_leads = 0
    if not dry_run:
        lead_id_map, seeded_leads = await _seed_leads(conn, leads_rows, person_id_map)
    logger.info(" → %d leads seeded", seeded_leads)

    # ── Phase 4: crm_property_interests ──────────────────────────────────────
    logger.info("[4/9] Seeding crm_property_interests…")
    pi_rows = read_csv("crm_property_interests.csv")
    seeded_pi = 0
    if not dry_run:
        seeded_pi = await _seed_property_interests(
            conn, pi_rows, person_id_map, lead_id_map
        )
    logger.info(" → %d property interests seeded", seeded_pi)

    # ── Phase 5: intel_interactions ──────────────────────────────────────────
    logger.info("[5/9] Seeding intel_interactions…")
    int_rows = read_csv("intel_interactions.csv")
    seeded_int = 0
    if not dry_run:
        seeded_int = await _seed_interactions(
            conn, int_rows, person_id_map, lead_id_map
        )
    logger.info(" → %d interactions seeded", seeded_int)

    # ── Phase 6: intel_qd_scores ─────────────────────────────────────────────
    logger.info("[6/9] Seeding intel_qd_scores…")
    qd_rows = read_csv("intel_qd_scores.csv")
    seeded_qd = 0
    if not dry_run:
        seeded_qd = await _seed_qd_scores(conn, qd_rows, person_id_map)
    logger.info(" → %d QD scores seeded", seeded_qd)

    # ── Phase 7: intel_reminders ─────────────────────────────────────────────
    logger.info("[7/9] Seeding intel_reminders…")
    rem_rows = read_csv("intel_reminders.csv")
    seeded_rem = 0
    if not dry_run:
        seeded_rem = await _seed_reminders(conn, rem_rows, person_id_map, lead_id_map)
    logger.info(" → %d reminders seeded", seeded_rem)

    # ── Phase 8: crm_stage_history ───────────────────────────────────────────
    logger.info("[8/9] Seeding crm_stage_history…")
    hist_rows = read_csv("crm_stage_history.csv")
    seeded_hist = 0
    if not dry_run:
        seeded_hist = await _seed_stage_history(conn, hist_rows, lead_id_map)
    logger.info(" → %d stage history records seeded", seeded_hist)

    # ── Phase 9: Summary ─────────────────────────────────────────────────────
    logger.info("[9/9] Seeding complete.")
    logger.info(
        "Summary: people=%d, leads=%d, interactions=%d, qd_scores=%d, reminders=%d, stage_history=%d",
        seeded_people, seeded_leads, seeded_int,
        seeded_qd, seeded_rem, seeded_hist,
    )
    if dry_run:
        logger.info("DRY RUN — no data was written to the database.")


async def seed(dry_run: bool = False, limit: int | None = None) -> None:
    """Seed the canonical tables from the CSVs under ``CSV_DIR``.

    Args:
        dry_run: When true, the database is still contacted for the schema
            check and the CSVs are still read, but no rows are written.
        limit: Maximum number of crm_people rows to process; ``None`` (or 0)
            processes all of them. Other tables are limited only indirectly,
            through the people they reference.

    The pool is closed on every exit path, including the early return taken
    when the ``crm_people`` table is missing and any exception raised while
    seeding.
    """
    # Function-local import: importing this module does not pull in the
    # backend package.
    from backend.db.pool import create_pool, close_pool

    logger.info("Connecting to database…")
    pool = await create_pool()
    try:
        async with pool.acquire() as conn:
            # Verify canonical schema exists
            exists = await conn.fetchval(
                "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'crm_people')"
            )
            if not exists:
                logger.error("Canonical schema not found. Run schema_crm_canonical.sql first.")
                return
            await _run_phases(conn, dry_run, limit)
    finally:
        await close_pool()


def main() -> None:
    """Parse command-line flags and run :func:`seed` to completion."""
    parser = argparse.ArgumentParser(description="Seed canonical CRM tables from synthetic data CSVs")
    parser.add_argument("--dry-run", action="store_true", help="Parse and validate without writing to DB")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of people to seed (for testing)")
    args = parser.parse_args()
    asyncio.run(seed(dry_run=args.dry_run, limit=args.limit))


if __name__ == "__main__":
    main()