# Project_Velocity/backend/scripts/seed_synthetic_crm.py

#!/usr/bin/env python3
"""
backend/scripts/seed_synthetic_crm.py
Seed the canonical CRM tables from the synthetic dataset CSVs.

Usage:
    python -m backend.scripts.seed_synthetic_crm [--dry-run] [--limit N]

Reads from: db assets/synthetic_crm_v1/csv/
Writes to: canonical crm_*, intel_*, inventory_* tables

This script implements the import → canonical commit flow without going through
the HTTP import review UI — for initial database seeding only.
"""
from __future__ import annotations

import argparse
import asyncio
import csv
import json
import logging
import math
import os
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

# Configure root logging at import time so every phase message carries a
# timestamp and level; all messages in this script go through ``logger``.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger("velocity.seed")

# ── Data directory ────────────────────────────────────────────────────────────
# Three directory levels above this file. Per the module docstring the script
# lives at backend/scripts/, which makes this the repository root.
REPO_ROOT = Path(__file__).parent.parent.parent
# Directory holding the synthetic dataset CSVs (note the space in "db assets").
CSV_DIR = REPO_ROOT / "db assets" / "synthetic_crm_v1" / "csv"


def read_csv(filename: str) -> list[dict]:
    """Load one dataset CSV from ``CSV_DIR`` as a list of row dicts.

    Args:
        filename: File name relative to ``CSV_DIR`` (e.g. ``"crm_people.csv"``).

    Returns:
        One dict per data row, keyed by the header row. An empty list (after
        logging a warning) when the file does not exist.
    """
    path = CSV_DIR / filename
    try:
        # "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading
        # byte-order mark. With plain "utf-8" a BOM stays glued to the first
        # header ("\ufeffperson_id"), so every lookup on that column misses.
        with path.open(encoding="utf-8-sig", newline="") as handle:
            return list(csv.DictReader(handle))
    except FileNotFoundError:
        logger.warning("CSV not found: %s", path)
        return []


def safe_float(val: str | None, default: float | None = None) -> float | None:
    """Parse a CSV cell as a finite float, falling back to *default*.

    Args:
        val: Raw cell text, or ``None`` when the cell is absent.
        default: Value returned when the cell is empty, a null marker,
            unparseable, or not a finite number.

    Returns:
        The parsed finite float, or *default*.
    """
    if not val:
        return default
    text = val.strip()
    # Null markers are matched case-insensitively so "NaN" / "NULL" behave
    # exactly like "nan" / "null".
    if text.lower() in ("", "null", "none", "nan"):
        return default
    try:
        number = float(text)
    except (ValueError, TypeError):
        return default
    # float() also accepts spellings such as "inf", "-Infinity" and "+nan";
    # none of them is a usable numeric value, so treat them as missing.
    return number if math.isfinite(number) else default


def safe_int(val: str | None, default: int | None = None) -> int | None:
    """Parse a CSV cell as an int, falling back to *default*.

    Integer text is parsed exactly. Decimal or exponent text (``"3.9"``,
    ``"1e3"``) is parsed as a float and truncated toward zero.

    Args:
        val: Raw cell text, or ``None`` when the cell is absent.
        default: Value returned when the cell is empty, a null marker,
            unparseable, or not representable as an int (NaN / infinity).

    Returns:
        The parsed integer, or *default*.
    """
    if not val:
        return default
    text = val.strip()
    if text in ("", "null", "None"):
        return default
    try:
        # Exact path: avoids the precision loss of a float round-trip for
        # integers larger than 2**53.
        return int(text)
    except ValueError:
        pass
    try:
        return int(float(text))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")), including overflowing literals
        # such as "1e400" that float() rounds to infinity.
        return default


def safe_dt(val: str | None) -> datetime | None:
    """Parse a CSV timestamp cell into a timezone-aware UTC datetime.

    ISO-8601 text is tried first, including a trailing ``Z`` or a numeric
    UTC offset, which is converted to UTC. Text without any offset is
    assumed to already be in UTC. A small set of ``strptime`` formats is
    kept as a fallback for lenient spellings such as non-padded fields.

    Args:
        val: Raw cell text, or ``None`` when the cell is absent.

    Returns:
        An aware datetime in UTC, or ``None`` when the cell is empty, a null
        marker, or not in a recognised format.
    """
    if not val:
        return None
    text = val.strip()
    if text in ("", "null", "None"):
        return None

    # datetime.fromisoformat() only understands a literal "Z" suffix from
    # Python 3.11 on; rewriting it as an explicit offset works everywhere.
    iso_text = text[:-1] + "+00:00" if text.endswith(("Z", "z")) else text
    try:
        parsed = datetime.fromisoformat(iso_text)
    except ValueError:
        parsed = None
    if parsed is not None:
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    for fmt in ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%Y-%m-%dT%H:%M:%S.%f"):
        try:
            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue
    return None


# Statuses passed through to the ``::crm_lead_status`` cast; any other value is stored as "new".
_VALID_LEAD_STATUSES = frozenset({
    'new', 'contacted', 'qualified', 'site_visit_scheduled', 'site_visited',
    'negotiation', 'booking_initiated', 'booked', 'lost', 'dormant',
})

# Channels passed through to the ``::intel_channel`` cast; any other value is stored as "system".
_VALID_CHANNELS = frozenset({
    'whatsapp', 'phone', 'email', 'site_visit', 'office_meeting',
    'video_call', 'cctv', 'perception_session', 'system',
})


def _cell(row: dict, key: str, default: str = "") -> str:
    """Return the cell for *key*, or *default* when it is missing, ``None`` or empty.

    ``csv.DictReader`` fills the cells of short rows with ``None`` and keeps
    empty cells as ``""``, so ``row.get(key, default)`` alone neither protects
    a following ``.strip()`` nor applies *default* to an empty cell.
    """
    return row.get(key) or default


async def _seed_projects(conn, rows: list[dict]) -> dict[str, str]:
    """Insert inventory_projects rows that do not exist yet; return project_name → project_id."""
    name_to_id: dict[str, str] = {}
    for row in rows:
        pname = _cell(row, "project_name").strip()
        if not pname:
            continue
        existing_id = await conn.fetchval(
            "SELECT project_id FROM inventory_projects WHERE project_name = $1",
            pname,
        )
        if existing_id:
            # Already present: reuse its id instead of inserting a duplicate.
            name_to_id[pname] = str(existing_id)
            continue
        new_id = str(uuid.uuid4())
        await conn.execute(
            """
            INSERT INTO inventory_projects (project_id, project_name, developer_name, city, micro_market, created_at, updated_at)
            VALUES ($1::uuid, $2, $3, $4, $5, NOW(), NOW())
            ON CONFLICT (project_name) DO NOTHING
            """,
            new_id,
            pname,
            _cell(row, "developer_name"),
            _cell(row, "city", "Kolkata"),
            _cell(row, "micro_market"),
        )
        name_to_id[pname] = new_id
    return name_to_id


async def _seed_people(conn, rows: list[dict]) -> dict[str, str]:
    """Insert crm_people rows; return CSV person_id → newly generated DB person_id."""
    id_map: dict[str, str] = {}
    for row in rows:
        src_id = _cell(row, "person_id")
        full_name = _cell(row, "full_name").strip()
        if not full_name:
            continue

        new_id = str(uuid.uuid4())
        persona_labels: list[str] = []
        raw_labels = _cell(row, "persona_labels")
        if raw_labels.startswith("["):
            try:
                persona_labels = json.loads(raw_labels)
            except json.JSONDecodeError:
                # Best effort: keep seeding with no labels, but leave a trace.
                logger.warning("Ignoring malformed persona_labels for person %s", src_id)

        await conn.execute(
            """
            INSERT INTO crm_people (
                person_id, full_name, primary_email, primary_phone,
                buyer_type, persona_labels, source_confidence,
                legacy_lead_id, metadata_json, created_at, updated_at
            ) VALUES (
                $1::uuid, $2, $3, $4, $5, $6::jsonb, $7,
                $8, $9::jsonb, NOW(), NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            new_id,
            full_name,
            row.get("primary_email") or None,
            row.get("primary_phone") or None,
            row.get("buyer_type") or None,
            json.dumps(persona_labels),
            safe_float(row.get("source_confidence"), 0.8),
            src_id or None,
            json.dumps({"synthetic": True, "source_id": src_id}),
        )
        # NOTE(review): the id is recorded even if ON CONFLICT skipped the
        # insert, and rows with an empty person_id all share the "" key —
        # confirm against the schema's unique constraints and the dataset.
        id_map[src_id] = new_id
    return id_map


async def _seed_leads(conn, rows: list[dict], person_id_map: dict[str, str]) -> dict[str, str]:
    """Insert crm_leads rows for known people; return CSV lead_id → DB lead_id."""
    id_map: dict[str, str] = {}
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            # Person was not seeded (filtered out, or beyond --limit).
            continue

        src_lead_id = _cell(row, "lead_id")
        raw_status = _cell(row, "status", "new").lower().strip()
        status = raw_status if raw_status in _VALID_LEAD_STATUSES else "new"

        new_lead_id = str(uuid.uuid4())
        await conn.execute(
            """
            INSERT INTO crm_leads (
                lead_id, person_id, source_system, status,
                budget_band, urgency, financing_posture,
                timeline_to_decision, legacy_lead_id,
                metadata_json, created_at, updated_at
            ) VALUES (
                $1::uuid, $2::uuid, $3, $4::crm_lead_status,
                $5, $6, $7, $8, $9, $10::jsonb, NOW(), NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            new_lead_id,
            db_person_id,
            _cell(row, "source_system", "csv_upload"),
            status,
            row.get("budget_band") or None,
            row.get("urgency") or None,
            row.get("financing_posture") or None,
            row.get("timeline_to_decision") or None,
            src_lead_id or None,
            json.dumps({"synthetic": True, "source_lead_id": src_lead_id}),
        )
        id_map[src_lead_id] = new_lead_id
    return id_map


async def _seed_property_interests(
    conn,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert crm_property_interests rows for known people; return the number of inserts issued."""
    issued = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        # The lead link is optional: an unknown lead_id is stored as NULL.
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))
        project_name = _cell(row, "project_name").strip()
        if not project_name:
            continue

        await conn.execute(
            """
            INSERT INTO crm_property_interests (
                interest_id, person_id, lead_id, project_name,
                unit_preference, configuration, budget_min, budget_max, priority, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_person_id,
            db_lead_id,
            project_name,
            row.get("unit_preference") or None,
            row.get("configuration") or None,
            safe_float(row.get("budget_min")),
            safe_float(row.get("budget_max")),
            safe_int(row.get("priority"), 1),
        )
        issued += 1
    return issued


async def _seed_interactions(
    conn,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> dict[str, str]:
    """Insert intel_interactions rows for known people; return CSV interaction_id → DB interaction_id."""
    id_map: dict[str, str] = {}
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue

        raw_channel = _cell(row, "channel", "system").lower().strip()
        channel = raw_channel if raw_channel in _VALID_CHANNELS else "system"

        src_int_id = _cell(row, "interaction_id")
        new_int_id = str(uuid.uuid4())

        # A missing or unparseable timestamp falls back to the current time.
        happened_at = safe_dt(row.get("happened_at")) or datetime.now(timezone.utc)
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))

        await conn.execute(
            """
            INSERT INTO intel_interactions (
                interaction_id, person_id, lead_id, channel,
                interaction_type, happened_at, summary, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4::intel_channel,
                $5, $6, $7, NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            new_int_id,
            db_person_id,
            db_lead_id,
            channel,
            _cell(row, "interaction_type", "message"),
            happened_at,
            row.get("summary") or None,
        )
        id_map[src_int_id] = new_int_id
    return id_map


async def _seed_qd_scores(conn, rows: list[dict], person_id_map: dict[str, str]) -> int:
    """Upsert intel_qd_scores rows for known people; return the number of upserts issued."""
    issued = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue

        score_type = _cell(row, "score_type", "intent_score")
        current_value = safe_float(row.get("current_value"), 0.5)
        if current_value is None:
            # Not reachable with a non-None default; kept as a guard so the
            # clamp below never sees None.
            continue
        # Clamp the score into [0.0, 1.0].
        current_value = max(0.0, min(1.0, current_value))

        await conn.execute(
            """
            INSERT INTO intel_qd_scores (
                qd_id, person_id, score_type, current_value,
                reasoning, computed_at
            ) VALUES (
                $1::uuid, $2::uuid, $3, $4, $5, NOW()
            )
            ON CONFLICT (person_id, score_type) DO UPDATE
            SET current_value = EXCLUDED.current_value,
                computed_at = NOW()
            """,
            str(uuid.uuid4()),
            db_person_id,
            score_type,
            current_value,
            row.get("reasoning") or None,
        )
        issued += 1
    return issued


async def _seed_reminders(
    conn,
    rows: list[dict],
    person_id_map: dict[str, str],
    lead_id_map: dict[str, str],
) -> int:
    """Insert intel_reminders rows for known people; return the number of inserts issued."""
    issued = 0
    for row in rows:
        db_person_id = person_id_map.get(_cell(row, "person_id"))
        if not db_person_id:
            continue
        title = _cell(row, "title").strip()
        if not title:
            continue

        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))

        await conn.execute(
            """
            INSERT INTO intel_reminders (
                reminder_id, person_id, lead_id, reminder_type, title, notes,
                due_at, status, priority, created_by_type, created_at
            ) VALUES (
                $1::uuid, $2::uuid, $3::uuid, $4, $5, $6,
                $7, $8, $9, 'system', NOW()
            )
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_person_id,
            db_lead_id,
            _cell(row, "reminder_type", "follow_up"),
            title,
            row.get("notes") or None,
            safe_dt(row.get("due_at")),
            _cell(row, "status", "pending"),
            _cell(row, "priority", "normal"),
        )
        issued += 1
    return issued


async def _seed_stage_history(conn, rows: list[dict], lead_id_map: dict[str, str]) -> int:
    """Insert crm_stage_history rows for known leads; return the number of inserts issued."""
    issued = 0
    for row in rows:
        db_lead_id = lead_id_map.get(_cell(row, "lead_id"))
        if not db_lead_id:
            continue

        await conn.execute(
            """
            INSERT INTO crm_stage_history (
                history_id, lead_id, from_status, to_status, notes, happened_at
            ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6)
            ON CONFLICT DO NOTHING
            """,
            str(uuid.uuid4()),
            db_lead_id,
            row.get("from_status") or None,
            _cell(row, "to_status", "new"),
            row.get("notes") or None,
            safe_dt(row.get("happened_at")) or datetime.now(timezone.utc),
        )
        issued += 1
    return issued


async def _run_seed_phases(pool, dry_run: bool, limit: int | None) -> None:
    """Check the schema, then run the seeding phases in dependency order against *pool*."""
    async with pool.acquire() as conn:
        # Verify canonical schema exists
        exists = await conn.fetchval(
            "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'crm_people')"
        )
    if not exists:
        logger.error("Canonical schema not found. Run schema_crm_canonical.sql first.")
        return

    # ── Phase 1: Inventory Projects ──────────────────────────────────────────
    logger.info("[1/9] Seeding inventory_projects…")
    projects_rows = read_csv("inventory_projects.csv")
    project_name_to_id: dict[str, str] = {}
    if not dry_run:
        async with pool.acquire() as conn:
            project_name_to_id = await _seed_projects(conn, projects_rows)
    logger.info("  → %d projects mapped", len(project_name_to_id))

    # ── Phase 2: crm_people ──────────────────────────────────────────────────
    logger.info("[2/9] Seeding crm_people…")
    people_rows = read_csv("crm_people.csv")
    if limit:
        people_rows = people_rows[:limit]
    person_id_map: dict[str, str] = {}  # original CSV person_id → DB person_id
    if not dry_run:
        async with pool.acquire() as conn:
            person_id_map = await _seed_people(conn, people_rows)
    logger.info("  → %d people seeded", len(person_id_map))

    # ── Phase 3: crm_leads ───────────────────────────────────────────────────
    logger.info("[3/9] Seeding crm_leads…")
    leads_rows = read_csv("crm_leads.csv")
    lead_id_map: dict[str, str] = {}  # original CSV lead_id → DB lead_id
    if not dry_run:
        async with pool.acquire() as conn:
            lead_id_map = await _seed_leads(conn, leads_rows, person_id_map)
    logger.info("  → %d leads seeded", len(lead_id_map))

    # ── Phase 4: crm_property_interests ─────────────────────────────────────
    logger.info("[4/9] Seeding crm_property_interests…")
    pi_rows = read_csv("crm_property_interests.csv")
    seeded_pi = 0
    if not dry_run:
        async with pool.acquire() as conn:
            seeded_pi = await _seed_property_interests(conn, pi_rows, person_id_map, lead_id_map)
    logger.info("  → %d property interests seeded", seeded_pi)

    # ── Phase 5: intel_interactions ──────────────────────────────────────────
    logger.info("[5/9] Seeding intel_interactions…")
    int_rows = read_csv("intel_interactions.csv")
    interaction_id_map: dict[str, str] = {}
    if not dry_run:
        async with pool.acquire() as conn:
            interaction_id_map = await _seed_interactions(conn, int_rows, person_id_map, lead_id_map)
    logger.info("  → %d interactions seeded", len(interaction_id_map))

    # ── Phase 6: intel_qd_scores ─────────────────────────────────────────────
    logger.info("[6/9] Seeding intel_qd_scores…")
    qd_rows = read_csv("intel_qd_scores.csv")
    seeded_qd = 0
    if not dry_run:
        async with pool.acquire() as conn:
            seeded_qd = await _seed_qd_scores(conn, qd_rows, person_id_map)
    logger.info("  → %d QD scores seeded", seeded_qd)

    # ── Phase 7: intel_reminders ─────────────────────────────────────────────
    logger.info("[7/9] Seeding intel_reminders…")
    rem_rows = read_csv("intel_reminders.csv")
    seeded_rem = 0
    if not dry_run:
        async with pool.acquire() as conn:
            seeded_rem = await _seed_reminders(conn, rem_rows, person_id_map, lead_id_map)
    logger.info("  → %d reminders seeded", seeded_rem)

    # ── Phase 8: crm_stage_history ───────────────────────────────────────────
    logger.info("[8/9] Seeding crm_stage_history…")
    hist_rows = read_csv("crm_stage_history.csv")
    seeded_hist = 0
    if not dry_run:
        async with pool.acquire() as conn:
            seeded_hist = await _seed_stage_history(conn, hist_rows, lead_id_map)
    logger.info("  → %d stage history records seeded", seeded_hist)

    # ── Phase 9: Summary ─────────────────────────────────────────────────────
    logger.info("[9/9] Seeding complete.")
    logger.info(
        "Summary: people=%d, leads=%d, interactions=%d, qd_scores=%d, reminders=%d, stage_history=%d",
        len(person_id_map),
        len(lead_id_map),
        len(interaction_id_map),
        seeded_qd,
        seeded_rem,
        seeded_hist,
    )
    if dry_run:
        logger.info("DRY RUN — no data was written to the database.")


async def seed(dry_run: bool = False, limit: int | None = None) -> None:
    """Seed the canonical CRM tables from the synthetic dataset CSVs.

    Phases run in dependency order: projects, people, leads, property
    interests, interactions, QD scores, reminders, stage history. Child rows
    whose person or lead was not seeded in this run are skipped.

    Args:
        dry_run: When true, the CSVs are still read and the schema check still
            runs (so a database connection is required), but no insert is
            issued and every reported count is zero.
        limit: Maximum number of ``crm_people.csv`` rows to seed. ``None`` or
            ``0`` means no limit.

    Returns:
        None. Returns early, after logging an error, when the ``crm_people``
        table does not exist.
    """
    from backend.db.pool import create_pool, close_pool

    logger.info("Connecting to database…")
    pool = await create_pool()
    try:
        await _run_seed_phases(pool, dry_run, limit)
    finally:
        # Release the pool on every exit path, including the schema-missing
        # early return and a failure halfway through a phase.
        await close_pool()


def main() -> None:
    """Parse the command-line flags and run :func:`seed` to completion.

    Exits with argparse's usage error (status 2) when ``--limit`` is negative.
    """
    parser = argparse.ArgumentParser(description="Seed canonical CRM tables from synthetic data CSVs")
    parser.add_argument("--dry-run", action="store_true", help="Parse and validate without writing to DB")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of people to seed (for testing)")
    args = parser.parse_args()

    # seed() applies the limit as a slice bound, so a negative value would
    # silently drop rows from the end of the people file instead of capping it.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    asyncio.run(seed(dry_run=args.dry_run, limit=args.limit))


# Run the CLI only when this module is executed as a script (including via
# ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()