#!/usr/bin/env python3
"""
backend/scripts/seed_synthetic_crm.py
Seed the canonical CRM tables from the synthetic dataset CSVs.

Usage:
    python -m backend.scripts.seed_synthetic_crm [--dry-run] [--limit N]

Reads from:  db assets/synthetic_crm_v1/csv/
Writes to:   canonical crm_*, intel_*, inventory_* tables

This script implements the import → canonical commit flow without going through
the HTTP import review UI — for initial database seeding only.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import csv
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# ── Logging ───────────────────────────────────────────────────────────────────
# Root logger is configured at import time; this module logs under "velocity.seed".
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("velocity.seed")

# ── Data directory ────────────────────────────────────────────────────────────
# Three levels up from this file; the CSVs live under "db assets/synthetic_crm_v1/csv".
REPO_ROOT = Path(__file__).parent.parent.parent
CSV_DIR = REPO_ROOT.joinpath("db assets", "synthetic_crm_v1", "csv")
|
|
|
|
|
|
def read_csv(filename: str) -> list[dict]:
    """Load one CSV from ``CSV_DIR`` and return its rows as dicts.

    Args:
        filename: File name relative to ``CSV_DIR``.

    Returns:
        One dict per data row, keyed by the header row (``csv.DictReader``).
        If the file does not exist, a warning is logged and ``[]`` is returned.
    """
    csv_path = CSV_DIR / filename
    if csv_path.exists():
        # newline="" lets the csv module handle embedded newlines itself.
        with csv_path.open(encoding="utf-8", newline="") as handle:
            return [*csv.DictReader(handle)]

    logger.warning("CSV not found: %s", csv_path)
    return []
|
|
|
|
|
|
def safe_float(val: str | None, default: float | None = None) -> float | None:
    """Parse a CSV cell as a float, returning *default* for missing/invalid data.

    Args:
        val: Raw cell text, or None.
        default: Value returned when the cell is empty, a null marker
            ("null", "none", "nan" in any letter case), unparseable, or
            parses to NaN.

    Returns:
        The parsed float, or *default*.
    """
    # Local import: the file's top-level import block does not include math.
    import math

    if not val:
        return default

    text = val.strip()
    # Case-insensitive so that "NaN" / "NULL" / "NONE" are treated like their
    # lowercase spellings instead of reaching float().
    if text.lower() in ("", "null", "none", "nan"):
        return default

    try:
        parsed = float(text)
    except (ValueError, TypeError):
        return default

    # float() also accepts spellings such as "+nan" / "-NaN"; a NaN must never
    # be handed back as a real value.
    return default if math.isnan(parsed) else parsed
|
|
|
|
|
|
def safe_int(val: str | None, default: int | None = None) -> int | None:
    """Parse a CSV cell as an int, returning *default* for missing/invalid data.

    Integer literals are parsed exactly. Anything else is parsed as a float
    and truncated toward zero (so "3.9" -> 3).

    Args:
        val: Raw cell text, or None.
        default: Value returned when the cell is empty, a null marker
            ("null" / "None"), unparseable, or not representable as an int
            (NaN, infinity).

    Returns:
        The parsed int, or *default*.
    """
    if not val:
        return default

    text = val.strip()
    if text in ("", "null", "None"):
        return default

    # Try an exact integer parse first: going through float() would lose
    # precision for magnitudes above 2**53.
    try:
        return int(text)
    except ValueError:
        pass

    try:
        return int(float(text))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) / int(float("1e999")).
        # ValueError also covers int(float("nan")).
        return default
|
|
|
|
|
|
def safe_dt(val: str | None) -> datetime | None:
    """Parse a CSV timestamp cell into a timezone-aware UTC datetime.

    The fixed formats below are tried first; a value without zone
    information is taken to be UTC. If none match, the value is parsed as
    ISO-8601 (including a trailing "Z" or a numeric UTC offset) and converted
    to UTC.

    Args:
        val: Raw cell text, or None.

    Returns:
        An aware datetime in UTC, or None when the cell is empty, a null
        marker ("null" / "None"), or cannot be parsed.
    """
    if not val:
        return None

    text = val.strip()
    if text in ("", "null", "None"):
        return None

    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue

    # ISO-8601 fallback. A trailing "Z" is rewritten to "+00:00" because
    # datetime.fromisoformat() only accepts "Z" from Python 3.11 onwards.
    iso_text = text[:-1] + "+00:00" if text.endswith(("Z", "z")) else text
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        return None

    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
|
|
|
|
|
# Lead statuses accepted for the $4::crm_lead_status cast; anything else is stored as "new".
_VALID_LEAD_STATUSES = frozenset({
    'new', 'contacted', 'qualified', 'site_visit_scheduled', 'site_visited',
    'negotiation', 'booking_initiated', 'booked', 'lost', 'dormant',
})

# Channels accepted for the $4::intel_channel cast; anything else is stored as "system".
_VALID_CHANNELS = frozenset({
    'whatsapp', 'phone', 'email', 'site_visit', 'office_meeting',
    'video_call', 'cctv', 'perception_session', 'system',
})


def _cell(row: dict, key: str, default: str = "") -> str:
    """Return ``row[key]``, or *default* when the column is absent or the cell is None.

    ``csv.DictReader`` fills the missing trailing fields of a short row with
    None, which would otherwise crash ``.strip()`` / ``.lower()`` calls.
    """
    value = row.get(key)
    return default if value is None else value


async def _seed_projects(conn, rows: list[dict]) -> dict[str, str]:
    """Insert inventory_projects rows that do not exist yet; return project_name → project_id."""
    name_to_id: dict[str, str] = {}
    for row in rows:
        pname = _cell(row, "project_name").strip()
        if not pname:
            continue
        pid = await conn.fetchval(
            "SELECT project_id FROM inventory_projects WHERE project_name = $1",
            pname,
        )
        if pid:
            # Already present: reuse the stored id.
            name_to_id[pname] = str(pid)
            continue
        pid = str(uuid.uuid4())
        await conn.execute(
            """
            INSERT INTO inventory_projects (project_id, project_name, developer_name, city, micro_market, created_at, updated_at)
            VALUES ($1::uuid, $2, $3, $4, $5, NOW(), NOW())
            ON CONFLICT (project_name) DO NOTHING
            """,
            pid,
            pname,
            _cell(row, "developer_name"),
            _cell(row, "city", "Kolkata"),
            _cell(row, "micro_market"),
        )
        name_to_id[pname] = pid
    return name_to_id


async def _seed_people(conn, rows: list[dict]) -> dict[str, str]:
    """Insert crm_people rows; return CSV person_id → DB person_id for rows actually written."""
    id_map: dict[str, str] = {}
    skipped = 0
    for row in rows:
        src_id = _cell(row, "person_id")
        full_name = _cell(row, "full_name").strip()
        if not full_name:
            continue

        persona_labels: list[str] = []
        raw_labels = _cell(row, "persona_labels")
        if raw_labels.startswith("["):
            try:
                persona_labels = json.loads(raw_labels)
            except json.JSONDecodeError:
                # Best effort: keep the person, drop the unreadable labels.
                logger.warning("Ignoring malformed persona_labels for person %r", src_id)

        new_id = str(uuid.uuid4())
        # RETURNING yields no row when ON CONFLICT DO NOTHING skips the insert,
        # so fetchval() is None in that case.
        inserted = await conn.fetchval(
            """
            INSERT INTO crm_people (
                person_id, full_name, primary_email, primary_phone,
                buyer_type, persona_labels, source_confidence,
                legacy_lead_id, metadata_json, created_at, updated_at
            ) VALUES (
                $1::uuid, $2, $3, $4, $5, $6::jsonb, $7,
                $8, $9::jsonb, NOW(), NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING person_id
            """,
            new_id,
            full_name,
            row.get("primary_email") or None,
            row.get("primary_phone") or None,
            row.get("buyer_type") or None,
            json.dumps(persona_labels),
            safe_float(row.get("source_confidence"), 0.8),
            src_id or None,
            json.dumps({"synthetic": True, "source_id": src_id}),
        )
        if inserted is None:
            # Never map to an id that was not written: dependent rows would
            # reference a person that does not exist.
            skipped += 1
            continue
        id_map[src_id] = new_id

    if skipped:
        logger.info(" → %d people skipped (conflict with existing rows)", skipped)
    return id_map


async def _seed_leads(conn, rows: list[dict], person_id_map: dict[str, str]) -> dict[str, str]:
    """Insert crm_leads rows for known people; return CSV lead_id → DB lead_id for rows written."""
    id_map: dict[str, str] = {}
    skipped = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue

        src_lead_id = _cell(row, "lead_id")
        raw_status = _cell(row, "status", "new").lower().strip()
        status = raw_status if raw_status in _VALID_LEAD_STATUSES else "new"

        new_lead_id = str(uuid.uuid4())
        inserted = await conn.fetchval(
            """
            INSERT INTO crm_leads (
                lead_id, person_id, source_system, status,
                budget_band, urgency, financing_posture,
                timeline_to_decision, legacy_lead_id,
                metadata_json, created_at, updated_at
            ) VALUES (
                $1::uuid, $2::uuid, $3, $4::crm_lead_status,
                $5, $6, $7, $8, $9, $10::jsonb, NOW(), NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING lead_id
            """,
            new_lead_id,
            db_person_id,
            _cell(row, "source_system", "csv_upload"),
            status,
            row.get("budget_band") or None,
            row.get("urgency") or None,
            row.get("financing_posture") or None,
            row.get("timeline_to_decision") or None,
            src_lead_id or None,
            json.dumps({"synthetic": True, "source_lead_id": src_lead_id}),
        )
        if inserted is None:
            skipped += 1
            continue
        id_map[src_lead_id] = new_lead_id

    if skipped:
        logger.info(" → %d leads skipped (conflict with existing rows)", skipped)
    return id_map


async def _seed_property_interests(
    conn,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert crm_property_interests rows for known people; return the number written."""
    written = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        # lead_id is optional here: an unknown lead is passed as NULL.
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))
        project_name = _cell(row, "project_name").strip()
        if not project_name:
            continue

        inserted = await conn.fetchval(
            """
            INSERT INTO crm_property_interests (
                interest_id, person_id, lead_id, project_name,
                unit_preference, configuration, budget_min, budget_max, priority, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING interest_id
            """,
            str(uuid.uuid4()),
            db_person_id,
            db_lead_id,
            project_name,
            row.get("unit_preference") or None,
            row.get("configuration") or None,
            safe_float(row.get("budget_min")),
            safe_float(row.get("budget_max")),
            safe_int(row.get("priority"), 1),
        )
        if inserted is not None:
            written += 1
    return written


async def _seed_interactions(
    conn,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> dict[str, str]:
    """Insert intel_interactions rows; return CSV interaction_id → DB interaction_id for rows written."""
    id_map: dict[str, str] = {}
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue

        raw_channel = _cell(row, "channel", "system").lower().strip()
        channel = raw_channel if raw_channel in _VALID_CHANNELS else "system"

        src_int_id = _cell(row, "interaction_id")
        new_int_id = str(uuid.uuid4())

        # An unparseable/missing timestamp falls back to the current time.
        happened_at = safe_dt(row.get("happened_at")) or datetime.now(timezone.utc)
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))

        inserted = await conn.fetchval(
            """
            INSERT INTO intel_interactions (
                interaction_id, person_id, lead_id, channel,
                interaction_type, happened_at, summary, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4::intel_channel,
                $5, $6, $7, NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING interaction_id
            """,
            new_int_id,
            db_person_id,
            db_lead_id,
            channel,
            _cell(row, "interaction_type", "message"),
            happened_at,
            row.get("summary") or None,
        )
        if inserted is not None:
            id_map[src_int_id] = new_int_id
    return id_map


async def _seed_qd_scores(conn, rows: list[dict], person_id_map: dict[str, str]) -> int:
    """Upsert intel_qd_scores rows (one per person and score_type); return the number processed."""
    written = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue

        score_type = _cell(row, "score_type", "intent_score")
        current_value = safe_float(row.get("current_value"), 0.5)
        if current_value is None:
            # Defensive: cannot happen with a non-None default, but narrows the type.
            continue
        # Clamp into [0.0, 1.0].
        current_value = max(0.0, min(1.0, current_value))

        await conn.execute(
            """
            INSERT INTO intel_qd_scores (
                qd_id, person_id, score_type, current_value,
                reasoning, computed_at
            ) VALUES (
                $1::uuid, $2::uuid, $3, $4, $5, NOW()
            )
            ON CONFLICT (person_id, score_type) DO UPDATE
            SET current_value = EXCLUDED.current_value,
                computed_at = NOW()
            """,
            str(uuid.uuid4()),
            db_person_id,
            score_type,
            current_value,
            row.get("reasoning") or None,
        )
        written += 1
    return written


async def _seed_reminders(
    conn,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert intel_reminders rows for known people; return the number written."""
    written = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        title = _cell(row, "title").strip()
        if not title:
            continue

        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))

        inserted = await conn.fetchval(
            """
            INSERT INTO intel_reminders (
                reminder_id, person_id, lead_id, reminder_type, title, notes,
                due_at, status, priority, created_by_type, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4, $5, $6,
                $7, $8, $9, 'system', NOW()
            )
            ON CONFLICT DO NOTHING
            RETURNING reminder_id
            """,
            str(uuid.uuid4()),
            db_person_id,
            db_lead_id,
            _cell(row, "reminder_type", "follow_up"),
            title,
            row.get("notes") or None,
            safe_dt(row.get("due_at")),
            _cell(row, "status", "pending"),
            _cell(row, "priority", "normal"),
        )
        if inserted is not None:
            written += 1
    return written


async def _seed_stage_history(conn, rows: list[dict], lead_id_map: dict[str, str]) -> int:
    """Insert crm_stage_history rows for known leads; return the number written."""
    written = 0
    for row in rows:
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))
        if not db_lead_id:
            continue

        inserted = await conn.fetchval(
            """
            INSERT INTO crm_stage_history (
                history_id, lead_id, from_status, to_status, notes, happened_at
            ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6)
            ON CONFLICT DO NOTHING
            RETURNING history_id
            """,
            str(uuid.uuid4()),
            db_lead_id,
            row.get("from_status") or None,
            _cell(row, "to_status", "new"),
            row.get("notes") or None,
            safe_dt(row.get("happened_at")) or datetime.now(timezone.utc),
        )
        if inserted is not None:
            written += 1
    return written


async def seed(dry_run: bool = False, limit: int | None = None) -> None:
    """Seed the canonical CRM tables from the synthetic dataset CSVs.

    Phases run in dependency order: projects, people, leads, property
    interests, interactions, QD scores, reminders, stage history. Rows whose
    parent person/lead was not written in this run are skipped.

    Args:
        dry_run: When True the CSVs are still read and the schema check still
            runs, but no INSERT is issued, so every count in the summary is 0.
        limit: Maximum number of crm_people CSV rows to consider. None means
            no limit; 0 seeds no people (and therefore no dependent rows).

    Note:
        The connection pool is always closed on exit, including the early
        return when the canonical schema is missing and when a phase raises.
        ``conn`` is whatever ``pool.acquire()`` yields — presumably an asyncpg
        connection (``$n`` placeholders, ``fetchval``); verify against
        ``backend.db.pool``.
    """
    from backend.db.pool import create_pool, close_pool

    logger.info("Connecting to database…")
    pool = await create_pool()

    try:
        async with pool.acquire() as conn:
            # Verify canonical schema exists
            exists = await conn.fetchval(
                "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'crm_people')"
            )
        if not exists:
            logger.error("Canonical schema not found. Run schema_crm_canonical.sql first.")
            return

        # ── Phase 1: Inventory Projects ──────────────────────────────────────
        logger.info("[1/9] Seeding inventory_projects…")
        projects_rows = read_csv("inventory_projects.csv")
        project_name_to_id: dict[str, str] = {}
        if not dry_run:
            async with pool.acquire() as conn:
                project_name_to_id = await _seed_projects(conn, projects_rows)
        logger.info(" → %d projects mapped", len(project_name_to_id))

        # ── Phase 2: crm_people ──────────────────────────────────────────────
        logger.info("[2/9] Seeding crm_people…")
        people_rows = read_csv("crm_people.csv")
        if limit is not None:
            # Explicit None check so that --limit 0 is honoured.
            people_rows = people_rows[:limit]

        person_id_map: dict[str, str] = {}  # original CSV person_id → DB person_id
        if not dry_run:
            async with pool.acquire() as conn:
                person_id_map = await _seed_people(conn, people_rows)
        logger.info(" → %d people seeded", len(person_id_map))

        # ── Phase 3: crm_leads ───────────────────────────────────────────────
        logger.info("[3/9] Seeding crm_leads…")
        leads_rows = read_csv("crm_leads.csv")
        lead_id_map: dict[str, str] = {}
        if not dry_run:
            async with pool.acquire() as conn:
                lead_id_map = await _seed_leads(conn, leads_rows, person_id_map)
        logger.info(" → %d leads seeded", len(lead_id_map))

        # ── Phase 4: crm_property_interests ──────────────────────────────────
        logger.info("[4/9] Seeding crm_property_interests…")
        pi_rows = read_csv("crm_property_interests.csv")
        seeded_pi = 0
        if not dry_run:
            async with pool.acquire() as conn:
                seeded_pi = await _seed_property_interests(conn, pi_rows, person_id_map, lead_id_map)
        logger.info(" → %d property interests seeded", seeded_pi)

        # ── Phase 5: intel_interactions ──────────────────────────────────────
        logger.info("[5/9] Seeding intel_interactions…")
        int_rows = read_csv("intel_interactions.csv")
        interaction_id_map: dict[str, str] = {}
        if not dry_run:
            async with pool.acquire() as conn:
                interaction_id_map = await _seed_interactions(conn, int_rows, person_id_map, lead_id_map)
        logger.info(" → %d interactions seeded", len(interaction_id_map))

        # ── Phase 6: intel_qd_scores ─────────────────────────────────────────
        logger.info("[6/9] Seeding intel_qd_scores…")
        qd_rows = read_csv("intel_qd_scores.csv")
        seeded_qd = 0
        if not dry_run:
            async with pool.acquire() as conn:
                seeded_qd = await _seed_qd_scores(conn, qd_rows, person_id_map)
        logger.info(" → %d QD scores seeded", seeded_qd)

        # ── Phase 7: intel_reminders ─────────────────────────────────────────
        logger.info("[7/9] Seeding intel_reminders…")
        rem_rows = read_csv("intel_reminders.csv")
        seeded_rem = 0
        if not dry_run:
            async with pool.acquire() as conn:
                seeded_rem = await _seed_reminders(conn, rem_rows, person_id_map, lead_id_map)
        logger.info(" → %d reminders seeded", seeded_rem)

        # ── Phase 8: crm_stage_history ───────────────────────────────────────
        logger.info("[8/9] Seeding crm_stage_history…")
        hist_rows = read_csv("crm_stage_history.csv")
        seeded_hist = 0
        if not dry_run:
            async with pool.acquire() as conn:
                seeded_hist = await _seed_stage_history(conn, hist_rows, lead_id_map)
        logger.info(" → %d stage history records seeded", seeded_hist)

        # ── Phase 9: Summary ─────────────────────────────────────────────────
        logger.info("[9/9] Seeding complete.")
        logger.info(
            "Summary: people=%d, leads=%d, interactions=%d, qd_scores=%d, reminders=%d, stage_history=%d",
            len(person_id_map),
            len(lead_id_map),
            len(interaction_id_map),
            seeded_qd,
            seeded_rem,
            seeded_hist,
        )
        if dry_run:
            logger.info("DRY RUN — no data was written to the database.")
    finally:
        # Release the pool on every exit path (early return, error, success).
        await close_pool()
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse the flags and run ``seed`` to completion."""
    cli = argparse.ArgumentParser(
        description="Seed canonical CRM tables from synthetic data CSVs",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse and validate without writing to DB",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of people to seed (for testing)",
    )
    options = cli.parse_args()

    # asyncio.run drives the seeding coroutine on a fresh event loop.
    seeding = seed(dry_run=options.dry_run, limit=options.limit)
    asyncio.run(seeding)


# Run only when executed as a script / via ``python -m``, not on import.
if __name__ == "__main__":
    main()
|