feat(crm): canonical crm and imported routes implementation
This commit is contained in:
458
backend/scripts/seed_synthetic_crm.py
Normal file
458
backend/scripts/seed_synthetic_crm.py
Normal file
@@ -0,0 +1,458 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
backend/scripts/seed_synthetic_crm.py
|
||||
Seed the canonical CRM tables from the synthetic dataset CSVs.
|
||||
|
||||
Usage:
|
||||
python -m backend.scripts.seed_synthetic_crm [--dry-run] [--limit N]
|
||||
|
||||
Reads from: db assets/synthetic_crm_v1/csv/
|
||||
Writes to: canonical crm_*, intel_*, inventory_* tables
|
||||
|
||||
This script implements the import → canonical commit flow without going through
|
||||
the HTTP import review UI — for initial database seeding only.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# Configure root logging at import time: timestamp, level name, then message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("velocity.seed")

# ── Data directory ────────────────────────────────────────────────────────────
# REPO_ROOT is three directory levels above this file; the synthetic CSVs are
# read from a fixed sub-path beneath it.
REPO_ROOT = Path(__file__).parent.parent.parent
CSV_DIR = REPO_ROOT.joinpath("db assets", "synthetic_crm_v1", "csv")
|
||||
|
||||
|
||||
def read_csv(filename: str, base_dir: Path | None = None) -> list[dict]:
    """Load one CSV file into a list of row dictionaries.

    Args:
        filename: File name relative to the data directory.
        base_dir: Directory to read from. Defaults to the module-level
            ``CSV_DIR`` when omitted.

    Returns:
        One dict per data row, keyed by the header row. An empty list is
        returned (and a warning logged) when the file does not exist.
    """
    directory = CSV_DIR if base_dir is None else base_dir
    path = directory / filename
    if not path.exists():
        logger.warning("CSV not found: %s", path)
        return []
    # "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading BOM
    # (common in spreadsheet exports); without it the first header becomes
    # "\ufeff<name>" and every lookup of that column silently misses.
    # newline="" is what the csv module requires for correct quoted-newline
    # handling.
    with open(path, encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f))
|
||||
|
||||
|
||||
def safe_float(val: str | None, default: float | None = None) -> float | None:
    """Parse a CSV cell as a float, falling back to *default*.

    Args:
        val: Raw cell text, or ``None`` for a missing cell.
        default: Value returned when the cell is empty, a null marker
            (``"null"``, ``"None"``, ``"nan"``), unparseable, or NaN.

    Returns:
        The parsed float, or *default*.
    """
    import math  # local import: only this helper needs it

    if not val or val.strip() in ("", "null", "None", "nan"):
        return default
    try:
        parsed = float(val)
    except (ValueError, TypeError):
        return default
    # float() accepts "NaN", "NAN", "-nan", ... which the literal check above
    # misses. NaN compares false to everything, so it would slip through
    # later range clamps; treat every spelling as a missing value.
    if math.isnan(parsed):
        return default
    return parsed
|
||||
|
||||
|
||||
def safe_int(val: str | None, default: int | None = None) -> int | None:
    """Parse a CSV cell as an integer, falling back to *default*.

    Plain integer text is parsed exactly; decimal or exponent text such as
    ``"3.0"`` or ``"1e3"`` is parsed as a float and truncated toward zero.

    Args:
        val: Raw cell text, or ``None`` for a missing cell.
        default: Value returned when the cell is empty, a null marker
            (``"null"``, ``"None"``), unparseable, or not finite.

    Returns:
        The parsed integer, or *default*.
    """
    if not val or val.strip() in ("", "null", "None"):
        return default
    text = val.strip()
    try:
        # Exact path first: going through float() would round integers
        # above 2**53.
        return int(text)
    except ValueError:
        pass
    try:
        return int(float(text))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")); ValueError: NaN or garbage text.
        return default
|
||||
|
||||
|
||||
def safe_dt(val: str | None) -> datetime | None:
    """Parse a CSV cell as a UTC-aware datetime.

    The fixed formats ``YYYY-MM-DDTHH:MM:SS``, ``YYYY-MM-DD HH:MM:SS``,
    ``YYYY-MM-DD`` and ``YYYY-MM-DDTHH:MM:SS.ffffff`` are tried first and
    interpreted as UTC. Anything else falls back to ISO-8601 parsing, which
    additionally accepts a trailing ``Z`` or a ``±HH:MM`` offset; such values
    are converted to UTC.

    Args:
        val: Raw cell text, or ``None`` for a missing cell.

    Returns:
        A timezone-aware datetime in UTC, or ``None`` when the cell is empty,
        a null marker (``"null"``, ``"None"``), or not a recognised timestamp.
    """
    if not val or val.strip() in ("", "null", "None"):
        return None
    text = val.strip()

    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue

    # Fallback for offset-bearing ISO timestamps. Older interpreters'
    # fromisoformat() rejects a bare "Z", so spell it as an explicit offset.
    iso_text = (text[:-1] + "+00:00") if text.endswith(("Z", "z")) else text
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # Naive values are taken to be UTC, matching the fixed-format path.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
||||
|
||||
|
||||
async def seed(dry_run: bool = False, limit: int | None = None) -> None:
    """Seed the canonical CRM tables from the synthetic dataset CSVs.

    Runs nine logged phases in order: inventory_projects, crm_people,
    crm_leads, crm_property_interests, intel_interactions, intel_qd_scores,
    intel_reminders, crm_stage_history, and a final summary. Each row gets a
    freshly generated UUID; CSV ids are translated to those UUIDs through
    in-memory maps so child rows reference the right parent. Child rows whose
    person (or, for stage history, lead) was not seeded in this run are
    skipped.

    Args:
        dry_run: When true, the database is still connected to and the schema
            check still runs, and the CSVs are still read, but no INSERT is
            issued; all reported counts are therefore zero.
        limit: Maximum number of crm_people CSV rows to consider. ``None``
            means no limit; ``0`` (or a negative value) means no people.

    Returns:
        None. If the ``crm_people`` table does not exist an error is logged
        and the function returns without seeding.
    """
    from backend.db.pool import create_pool, close_pool

    logger.info("Connecting to database…")
    pool = await create_pool()

    # try/finally so the pool is released on the early "schema missing"
    # return and on any failure mid-seed, not only on the success path.
    try:
        async with pool.acquire() as conn:
            # Verify canonical schema exists
            exists = await conn.fetchval(
                "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'crm_people')"
            )
            if not exists:
                logger.error("Canonical schema not found. Run schema_crm_canonical.sql first.")
                return

        # ── Phase 1: Inventory Projects ──────────────────────────────────────
        logger.info("[1/9] Seeding inventory_projects…")
        projects_rows = read_csv("inventory_projects.csv")
        project_name_to_id: dict[str, str] = {}

        if not dry_run:
            async with pool.acquire() as conn:
                for row in projects_rows:
                    # `or ""`: DictReader yields None for cells missing from
                    # a short row.
                    pname = (row.get("project_name") or "").strip()
                    if not pname:
                        continue
                    # Reuse the existing id when the project is already there.
                    pid = await conn.fetchval(
                        "SELECT project_id FROM inventory_projects WHERE project_name = $1",
                        pname,
                    )
                    if pid:
                        project_name_to_id[pname] = str(pid)
                        continue
                    pid = str(uuid.uuid4())
                    await conn.execute(
                        """
                        INSERT INTO inventory_projects (project_id, project_name, developer_name, city, micro_market, created_at, updated_at)
                        VALUES ($1::uuid, $2, $3, $4, $5, NOW(), NOW())
                        ON CONFLICT (project_name) DO NOTHING
                        """,
                        pid,
                        pname,
                        row.get("developer_name", ""),
                        row.get("city", "Kolkata"),
                        row.get("micro_market", ""),
                    )
                    project_name_to_id[pname] = pid
        logger.info(" → %d projects mapped", len(project_name_to_id))

        # ── Phase 2: crm_people ──────────────────────────────────────────────
        logger.info("[2/9] Seeding crm_people…")
        people_rows = read_csv("crm_people.csv")
        if limit is not None:
            # `is not None` so that limit=0 really means "no people"; a
            # negative limit is clamped rather than slicing from the end.
            people_rows = people_rows[: max(limit, 0)]

        person_id_map: dict[str, str] = {}  # original CSV person_id → DB person_id
        seeded_people = 0

        if not dry_run:
            async with pool.acquire() as conn:
                for row in people_rows:
                    src_id = row.get("person_id") or ""
                    full_name = (row.get("full_name") or "").strip()
                    if not full_name:
                        continue

                    new_id = str(uuid.uuid4())
                    persona_labels: list[str] = []
                    raw_labels = row.get("persona_labels") or ""
                    if raw_labels.startswith("["):
                        try:
                            persona_labels = json.loads(raw_labels)
                        except json.JSONDecodeError:
                            # Malformed label JSON: seed the person with no labels.
                            pass

                    await conn.execute(
                        """
                        INSERT INTO crm_people (
                            person_id, full_name, primary_email, primary_phone,
                            buyer_type, persona_labels, source_confidence,
                            legacy_lead_id, metadata_json, created_at, updated_at
                        ) VALUES (
                            $1::uuid, $2, $3, $4, $5, $6::jsonb, $7,
                            $8, $9::jsonb, NOW(), NOW()
                        )
                        ON CONFLICT DO NOTHING
                        """,
                        new_id,
                        full_name,
                        row.get("primary_email") or None,
                        row.get("primary_phone") or None,
                        row.get("buyer_type") or None,
                        json.dumps(persona_labels),
                        safe_float(row.get("source_confidence"), 0.8),
                        src_id or None,
                        json.dumps({"synthetic": True, "source_id": src_id}),
                    )
                    seeded_people += 1
                    # Only map real source ids: a shared "" key would attach
                    # every child row with a blank person_id to whichever
                    # id-less person was seeded last.
                    if src_id:
                        person_id_map[src_id] = new_id

        logger.info(" → %d people seeded", seeded_people)

        # ── Phase 3: crm_leads ───────────────────────────────────────────────
        logger.info("[3/9] Seeding crm_leads…")
        leads_rows = read_csv("crm_leads.csv")
        lead_id_map: dict[str, str] = {}
        seeded_leads = 0

        VALID_STATUSES = {
            'new', 'contacted', 'qualified', 'site_visit_scheduled', 'site_visited',
            'negotiation', 'booking_initiated', 'booked', 'lost', 'dormant'
        }

        if not dry_run:
            async with pool.acquire() as conn:
                for row in leads_rows:
                    src_person_id = row.get("person_id") or ""
                    db_person_id = person_id_map.get(src_person_id)
                    if not db_person_id:
                        continue

                    src_lead_id = row.get("lead_id") or ""
                    raw_status = (row.get("status") or "new").lower().strip()
                    # Unknown statuses fall back to "new" so the enum cast succeeds.
                    status = raw_status if raw_status in VALID_STATUSES else "new"

                    new_lead_id = str(uuid.uuid4())
                    await conn.execute(
                        """
                        INSERT INTO crm_leads (
                            lead_id, person_id, source_system, status,
                            budget_band, urgency, financing_posture,
                            timeline_to_decision, legacy_lead_id,
                            metadata_json, created_at, updated_at
                        ) VALUES (
                            $1::uuid, $2::uuid, $3, $4::crm_lead_status,
                            $5, $6, $7, $8, $9, $10::jsonb, NOW(), NOW()
                        )
                        ON CONFLICT DO NOTHING
                        """,
                        new_lead_id,
                        db_person_id,
                        row.get("source_system", "csv_upload"),
                        status,
                        row.get("budget_band") or None,
                        row.get("urgency") or None,
                        row.get("financing_posture") or None,
                        row.get("timeline_to_decision") or None,
                        src_lead_id or None,
                        json.dumps({"synthetic": True, "source_lead_id": src_lead_id}),
                    )
                    seeded_leads += 1
                    if src_lead_id:
                        lead_id_map[src_lead_id] = new_lead_id

        logger.info(" → %d leads seeded", seeded_leads)

        # ── Phase 4: crm_property_interests ─────────────────────────────────
        logger.info("[4/9] Seeding crm_property_interests…")
        pi_rows = read_csv("crm_property_interests.csv")
        seeded_pi = 0

        if not dry_run:
            async with pool.acquire() as conn:
                for row in pi_rows:
                    src_person_id = row.get("person_id") or ""
                    db_person_id = person_id_map.get(src_person_id)
                    if not db_person_id:
                        continue
                    # lead_id is optional here: an unknown lead maps to NULL.
                    db_lead_id = lead_id_map.get(row.get("lead_id") or "")
                    project_name = (row.get("project_name") or "").strip()
                    if not project_name:
                        continue

                    await conn.execute(
                        """
                        INSERT INTO crm_property_interests (
                            interest_id, person_id, lead_id, project_name,
                            unit_preference, configuration, budget_min, budget_max, priority, created_at
                        ) VALUES (
                            $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, NOW()
                        )
                        ON CONFLICT DO NOTHING
                        """,
                        str(uuid.uuid4()),
                        db_person_id,
                        db_lead_id,
                        project_name,
                        row.get("unit_preference") or None,
                        row.get("configuration") or None,
                        safe_float(row.get("budget_min")),
                        safe_float(row.get("budget_max")),
                        safe_int(row.get("priority"), 1),
                    )
                    seeded_pi += 1

        logger.info(" → %d property interests seeded", seeded_pi)

        # ── Phase 5: intel_interactions ──────────────────────────────────────
        logger.info("[5/9] Seeding intel_interactions…")
        int_rows = read_csv("intel_interactions.csv")
        interaction_id_map: dict[str, str] = {}
        seeded_int = 0

        VALID_CHANNELS = {
            'whatsapp', 'phone', 'email', 'site_visit', 'office_meeting',
            'video_call', 'cctv', 'perception_session', 'system'
        }

        if not dry_run:
            async with pool.acquire() as conn:
                for row in int_rows:
                    src_person_id = row.get("person_id") or ""
                    db_person_id = person_id_map.get(src_person_id)
                    if not db_person_id:
                        continue

                    raw_channel = (row.get("channel") or "system").lower().strip()
                    # Unknown channels fall back to "system" so the enum cast succeeds.
                    channel = raw_channel if raw_channel in VALID_CHANNELS else "system"

                    src_int_id = row.get("interaction_id") or ""
                    new_int_id = str(uuid.uuid4())

                    # Missing or unparseable timestamps are stamped with "now".
                    happened_at = safe_dt(row.get("happened_at")) or datetime.now(timezone.utc)
                    db_lead_id = lead_id_map.get(row.get("lead_id") or "")

                    await conn.execute(
                        """
                        INSERT INTO intel_interactions (
                            interaction_id, person_id, lead_id, channel,
                            interaction_type, happened_at, summary, created_at
                        ) VALUES (
                            $1::uuid, $2::uuid, $3::uuid, $4::intel_channel,
                            $5, $6, $7, NOW()
                        )
                        ON CONFLICT DO NOTHING
                        """,
                        new_int_id,
                        db_person_id,
                        db_lead_id,
                        channel,
                        row.get("interaction_type", "message"),
                        happened_at,
                        row.get("summary") or None,
                    )
                    seeded_int += 1
                    if src_int_id:
                        interaction_id_map[src_int_id] = new_int_id

        logger.info(" → %d interactions seeded", seeded_int)

        # ── Phase 6: intel_qd_scores ─────────────────────────────────────────
        logger.info("[6/9] Seeding intel_qd_scores…")
        qd_rows = read_csv("intel_qd_scores.csv")
        seeded_qd = 0

        if not dry_run:
            async with pool.acquire() as conn:
                for row in qd_rows:
                    src_person_id = row.get("person_id") or ""
                    db_person_id = person_id_map.get(src_person_id)
                    if not db_person_id:
                        continue

                    score_type = row.get("score_type", "intent_score")
                    current_value = safe_float(row.get("current_value"), 0.5)
                    if current_value is None:
                        continue
                    # Clamp into [0.0, 1.0] before storing.
                    current_value = max(0.0, min(1.0, current_value))

                    await conn.execute(
                        """
                        INSERT INTO intel_qd_scores (
                            qd_id, person_id, score_type, current_value,
                            reasoning, computed_at
                        ) VALUES (
                            $1::uuid, $2::uuid, $3, $4, $5, NOW()
                        )
                        ON CONFLICT (person_id, score_type) DO UPDATE
                        SET current_value = EXCLUDED.current_value,
                            computed_at = NOW()
                        """,
                        str(uuid.uuid4()),
                        db_person_id,
                        score_type,
                        current_value,
                        row.get("reasoning") or None,
                    )
                    seeded_qd += 1

        logger.info(" → %d QD scores seeded", seeded_qd)

        # ── Phase 7: intel_reminders ─────────────────────────────────────────
        logger.info("[7/9] Seeding intel_reminders…")
        rem_rows = read_csv("intel_reminders.csv")
        seeded_rem = 0

        if not dry_run:
            async with pool.acquire() as conn:
                for row in rem_rows:
                    src_person_id = row.get("person_id") or ""
                    db_person_id = person_id_map.get(src_person_id)
                    if not db_person_id:
                        continue
                    title = (row.get("title") or "").strip()
                    if not title:
                        continue

                    db_lead_id = lead_id_map.get(row.get("lead_id") or "")

                    await conn.execute(
                        """
                        INSERT INTO intel_reminders (
                            reminder_id, person_id, lead_id, reminder_type, title, notes,
                            due_at, status, priority, created_by_type, created_at
                        ) VALUES (
                            $1::uuid, $2::uuid, $3::uuid, $4, $5, $6,
                            $7, $8, $9, 'system', NOW()
                        )
                        ON CONFLICT DO NOTHING
                        """,
                        str(uuid.uuid4()),
                        db_person_id,
                        db_lead_id,
                        row.get("reminder_type", "follow_up"),
                        title,
                        row.get("notes") or None,
                        safe_dt(row.get("due_at")),
                        row.get("status", "pending"),
                        row.get("priority", "normal"),
                    )
                    seeded_rem += 1

        logger.info(" → %d reminders seeded", seeded_rem)

        # ── Phase 8: crm_stage_history ───────────────────────────────────────
        logger.info("[8/9] Seeding crm_stage_history…")
        hist_rows = read_csv("crm_stage_history.csv")
        seeded_hist = 0

        if not dry_run:
            async with pool.acquire() as conn:
                for row in hist_rows:
                    src_lead_id = row.get("lead_id") or ""
                    db_lead_id = lead_id_map.get(src_lead_id)
                    if not db_lead_id:
                        continue

                    await conn.execute(
                        """
                        INSERT INTO crm_stage_history (
                            history_id, lead_id, from_status, to_status, notes, happened_at
                        ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6)
                        ON CONFLICT DO NOTHING
                        """,
                        str(uuid.uuid4()),
                        db_lead_id,
                        row.get("from_status") or None,
                        row.get("to_status", "new"),
                        row.get("notes") or None,
                        safe_dt(row.get("happened_at")) or datetime.now(timezone.utc),
                    )
                    seeded_hist += 1

        logger.info(" → %d stage history records seeded", seeded_hist)

        # ── Phase 9: Summary ─────────────────────────────────────────────────
        logger.info("[9/9] Seeding complete.")
        logger.info(
            "Summary: people=%d, leads=%d, interactions=%d, qd_scores=%d, reminders=%d, stage_history=%d",
            seeded_people,
            seeded_leads,
            seeded_int,
            seeded_qd,
            seeded_rem,
            seeded_hist,
        )
        if dry_run:
            logger.info("DRY RUN — no data was written to the database.")
    finally:
        await close_pool()
|
||||
|
||||
|
||||
def _non_negative_int(text: str) -> int:
    """argparse ``type=`` callback: parse *text* as an integer >= 0."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be a non-negative integer, got {value}")
    return value


def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse options and run the seeding coroutine.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Options:
        --dry-run   passed to ``seed`` as ``dry_run``.
        --limit N   passed to ``seed`` as ``limit``; must be >= 0, because a
                    negative value would otherwise be used as a slice bound
                    and drop rows from the end of the people list.
    """
    parser = argparse.ArgumentParser(description="Seed canonical CRM tables from synthetic data CSVs")
    parser.add_argument("--dry-run", action="store_true", help="Parse and validate without writing to DB")
    parser.add_argument(
        "--limit",
        type=_non_negative_int,
        default=None,
        help="Limit number of people to seed (for testing)",
    )
    args = parser.parse_args(argv)

    asyncio.run(seed(dry_run=args.dry_run, limit=args.limit))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user