# Project_Velocity/backend/services/imports/ingest_service.py

"""
backend/services/imports/ingest_service.py
CRM Import Ingestion Service

Implements the RawImportBatch → ImportMappingManifest → NormalizedEntityProposal pipeline
as specified in Doc 08 (Adapter Spec) and Doc 07 (Contracts and Schema Blueprint).

Flow:
  1. receive CSV upload, store raw batch record
  2. parse headers and infer column mapping
  3. validate row structure, detect unresolved columns
  4. create NormalizedEntityProposal records for review
  5. queue for human approval before canonical commit
"""
from __future__ import annotations

import csv
import io
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any

logger = logging.getLogger("velocity.imports.ingest")

# ── Column mapping heuristics ─────────────────────────────────────────────────
# Known source-column spellings, grouped by the canonical crm_people / crm_leads
# field they feed. Spellings are stored in normalized form (lower-case, spaces
# instead of underscores) because lookups go through _normalize_header first.
_ALIASES_BY_CANONICAL_FIELD: dict[str, tuple[str, ...]] = {
    # Identity
    # NOTE(review): a bare "first name" column is treated as the full name.
    "full_name": (
        "name",
        "full name",
        "client name",
        "contact name",
        "first name",
        "customer name",
    ),
    # Email
    "primary_email": ("email", "email address", "e-mail"),
    # Phone
    "primary_phone": (
        "phone",
        "mobile",
        "contact number",
        "mobile number",
        "phone number",
    ),
    # Budget
    "budget_band": ("budget", "budget range", "investment budget"),
    # Project interest
    "project_name": (
        "project",
        "project name",
        "interested in",
        "property interest",
    ),
    # Source
    "source_system": ("source", "lead source", "channel"),
    # Status / Stage
    "status": ("status", "lead status", "stage", "funnel stage"),
    # Notes
    "notes": ("notes", "remarks", "comment", "comments"),
    # Buyer type
    "buyer_type": ("type", "client type", "category"),
}

# Flat lookup table: normalized source column name → canonical field.
CANONICAL_COLUMN_MAP: dict[str, str] = {
    alias: canonical_field
    for canonical_field, aliases in _ALIASES_BY_CANONICAL_FIELD.items()
    for alias in aliases
}

# A proposal missing any of these is flagged for mandatory review.
REQUIRED_CANONICAL_FIELDS = {"full_name"}
# Contact fields whose absence lowers a proposal's confidence score.
HIGH_RISK_FIELDS = {"primary_email", "primary_phone"}


def _normalize_header(h: str) -> str:
    return h.strip().lower().replace("_", " ")


def infer_column_mapping(headers: list[str]) -> dict[str, Any]:
    """
    Match source headers against CANONICAL_COLUMN_MAP and report the outcome
    in an ImportMappingManifest-compatible dict.

    Returns: {
        mapped: {source_col → canonical_field},
        unmapped: [source_col, ...],
        mapped_count / unmapped_count: sizes of the two collections above,
        confidence: share of headers that were mapped, 0.0-1.0 (3 decimals)
    }
    """
    resolved: dict[str, str] = {}
    leftovers: list[str] = []

    for source_col in headers:
        # Lookup uses the normalized spelling; results stay keyed by the
        # header exactly as it appeared in the file.
        target = CANONICAL_COLUMN_MAP.get(_normalize_header(source_col))
        if not target:
            leftovers.append(source_col)
            continue
        resolved[source_col] = target

    header_total = len(headers)
    resolved_total = len(resolved)
    # No headers at all means nothing could be mapped, hence zero confidence.
    if header_total > 0:
        ratio = resolved_total / header_total
    else:
        ratio = 0.0

    return {
        "mapped": resolved,
        "unmapped": leftovers,
        "mapped_count": resolved_total,
        "unmapped_count": len(leftovers),
        "confidence": round(ratio, 3),
    }


def parse_csv_content(content: str) -> dict[str, Any]:
    """
    Parse CSV content, detect headers, and extract rows.

    The first record is used as the header row. Every following record becomes
    a dict keyed by header. Records shorter than the header are padded with
    empty strings; surplus values of longer records are kept by csv.DictReader
    under the key None. A record the csv module cannot parse is skipped and
    described in parse_errors instead of aborting the whole import.

    Returns: {headers, rows, row_count, parse_errors}
    """
    # Files saved as "UTF-8 with BOM" decode with a leading U+FEFF that would
    # otherwise become part of the first header name.
    if content and content.startswith("\ufeff"):
        content = content[1:]

    # newline="" passes line endings to the csv module untouched, which is what
    # the csv docs require; without it a CR-only file arrives as one giant line
    # and fails on the header. restval="" keeps short rows free of None values.
    reader = csv.DictReader(io.StringIO(content, newline=""), restval="")
    rows: list[dict[str, Any]] = []
    parse_errors: list[str] = []

    try:
        headers = list(reader.fieldnames or [])
    except csv.Error as exc:
        # Without a readable header row nothing else can be interpreted.
        return {
            "headers": [],
            "rows": rows,
            "row_count": 0,
            "parse_errors": [f"Row 1: {exc}"],
        }

    # csv.Error is raised while *advancing* the reader, so next() itself has to
    # sit inside the try; a plain for-loop would let the error escape.
    while True:
        try:
            row = next(reader)
        except StopIteration:
            break
        except csv.Error as exc:
            # Record number counted like a spreadsheet row: header is row 1.
            record_number = len(rows) + len(parse_errors) + 2
            parse_errors.append(f"Row {record_number}: {exc}")
            continue
        rows.append(dict(row))

    return {
        "headers": headers,
        "rows": rows,
        "row_count": len(rows),
        "parse_errors": parse_errors,
    }


def build_normalized_proposals(
    rows: list[dict[str, Any]],
    mapping: dict[str, str],
    batch_id: str,
    source_system: str = "csv_upload",
) -> list[dict[str, Any]]:
    """
    Convert raw CSV rows to NormalizedEntityProposal payloads.
    One proposal per row — each must be approved before canonical commit.

    Args:
        rows: row dicts keyed by source column name.
        mapping: source column name → canonical field name.
        batch_id: identifier copied onto every proposal.
        source_system: label copied onto every proposal.

    Returns:
        One proposal dict per input row, in input order. Confidence starts at
        1.0, drops by 0.4 when a required field is missing (which also sets
        review_required) and by 0.1 for each missing high-risk field.
    """
    proposals: list[dict[str, Any]] = []
    now = datetime.now(timezone.utc).isoformat()

    for i, row in enumerate(rows):
        canonical: dict[str, Any] = {}
        unresolved: list[str] = []
        confidence = 1.0

        for src_col, canonical_field in mapping.items():
            # row.get(col, "") still yields None when the key exists with a
            # None value (csv.DictReader pads short rows that way), and cells
            # may be non-strings when rows come from another producer, so
            # coerce explicitly before stripping.
            raw_value = row.get(src_col)
            val = "" if raw_value is None else str(raw_value).strip()
            if val:
                canonical[canonical_field] = val
            else:
                unresolved.append(src_col)

        # Validate required fields (sorted so the output order is stable)
        review_required = False
        missing_required = sorted(
            f for f in REQUIRED_CANONICAL_FIELDS if not canonical.get(f)
        )
        if missing_required:
            review_required = True
            confidence = max(0.0, confidence - 0.4)

        # Flag high-risk fields (email/phone) if empty
        missing_high_risk = [f for f in HIGH_RISK_FIELDS if not canonical.get(f)]
        if missing_high_risk:
            confidence = max(0.0, confidence - 0.1 * len(missing_high_risk))

        proposal: dict[str, Any] = {
            "proposal_id": str(uuid.uuid4()),
            "batch_id": batch_id,
            # +2: rows are 0-indexed here and row 1 is taken to be the header.
            "row_number": i + 2,
            "entity_type": "crm_person_with_lead",
            "canonical_payload": canonical,
            "raw_row": row,
            "unresolved_fields": unresolved,
            "missing_required": missing_required,
            "confidence": round(confidence, 3),
            "review_required": review_required,
            "status": "proposed",
            "created_at": now,
            "source_system": source_system,
        }
        proposals.append(proposal)

    return proposals


def create_import_batch_record(
    filename: str,
    row_count: int,
    mapping_manifest: dict[str, Any],
    source_system: str = "csv_upload",
    uploaded_by_id: str | None = None,
) -> dict[str, Any]:
    """
    Assemble the payload for a workflow_import_batches record.

    A fresh UUID4 string is generated for batch_id, and created_at / updated_at
    share one UTC ISO-8601 timestamp. The mapped/unresolved counters are read
    from the manifest's mapped_count / unmapped_count entries (0 when absent).
    """
    timestamp = datetime.now(timezone.utc).isoformat()

    record: dict[str, Any] = {"batch_id": str(uuid.uuid4())}
    record["source_system"] = source_system
    record["uploaded_filename"] = filename
    record["mime_type"] = "text/csv"
    record["row_count"] = row_count
    record["mapped_count"] = mapping_manifest.get("mapped_count", 0)
    record["unresolved_count"] = mapping_manifest.get("unmapped_count", 0)
    record["uploaded_by"] = uploaded_by_id
    record["lifecycle"] = "parsed"
    record["mapping_manifest"] = mapping_manifest
    record["created_at"] = timestamp
    record["updated_at"] = timestamp
    return record


async def persist_import_batch(conn: Any, batch: dict[str, Any]) -> str:
    """
    Write one workflow_import_batches row for `batch` and hand back its batch_id.

    batch_id and source_system must be present in `batch`; the remaining
    columns fall back to defaults when their key is absent. created_at and
    updated_at are set by the database (NOW()), not taken from `batch`.
    """
    insert_sql = """
        INSERT INTO workflow_import_batches (
            batch_id, source_system, uploaded_filename, mime_type, row_count,
            mapped_count, unresolved_count, uploaded_by, lifecycle, mapping_manifest,
            created_at, updated_at
        ) VALUES (
            $1::uuid, $2, $3, $4, $5, $6, $7,
            $8::uuid, $9::import_lifecycle, $10::jsonb, NOW(), NOW()
        )
        """

    # Positional parameters, in $1..$10 order.
    params = (
        batch["batch_id"],
        batch["source_system"],
        batch.get("uploaded_filename", "unknown.csv"),
        batch.get("mime_type", "text/csv"),
        batch.get("row_count", 0),
        batch.get("mapped_count", 0),
        batch.get("unresolved_count", 0),
        batch.get("uploaded_by"),
        batch.get("lifecycle", "parsed"),
        # The manifest is sent as JSON text and cast to jsonb by the statement.
        json.dumps(batch.get("mapping_manifest", {})),
    )

    await conn.execute(insert_sql, *params)
    return batch["batch_id"]


async def persist_proposals_as_workflow_actions(
    conn: Any, proposals: list[dict[str, Any]]
) -> int:
    """
    Store each proposal as a pending 'import_proposal' row in workflow_actions
    so it can be reviewed by a human.

    Rows are written one statement at a time, in list order. The whole proposal
    is serialized into proposal_payload, and its review_required flag becomes
    approval_required.

    Returns the number of rows inserted.
    """
    insert_sql = """
            INSERT INTO workflow_actions (
                action_id, action_type, target_domain, proposal_payload,
                reasoning_summary, confidence, status, approval_required,
                created_by_agent, created_at, updated_at
            ) VALUES (
                $1::uuid, 'import_proposal', 'crm', $2::jsonb,
                $3, $4, 'pending'::wf_status, $5, 'ingest_service', NOW(), NOW()
            )
            """

    written = 0
    for proposal in proposals:
        # Positional parameters, in $1..$5 order.
        params = (
            proposal["proposal_id"],
            json.dumps(proposal),
            # Short human-readable label shown as the reasoning summary.
            f"Import row {proposal['row_number']}: {proposal['canonical_payload'].get('full_name', 'unknown')}",
            proposal["confidence"],
            proposal["review_required"],
        )
        await conn.execute(insert_sql, *params)
        written += 1
    return written