# Signal/python-backend/api/normalizer.py

"""
CSV header normalization for Signal.

Maps messy supplier CSV exports to canonical ShipmentRecord fields.
Tolerates header drift, alternative column names, and common date formats.
"""

import csv
import io
import re
from datetime import date, datetime
from typing import Optional

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from core.coverage_calculator import ShipmentRecord

# Canonical ShipmentRecord field -> raw CSV header spellings accepted for it.
# _map_header / _map_header_with_confidence pass both the incoming header and
# every alias through _normalize_key before comparing, so differences in case,
# surrounding whitespace, "-" and "_" are ignored; several entries below are
# therefore equivalent spellings of one another.
HEADER_MAP: dict[str, list[str]] = {
    # Reported by normalize_csv under "required_missing" when absent; rows with
    # an empty value are skipped.
    "patient_id": [
        "patient_id", "patientid", "patient id", "pt_id", "pt id",
        "mrn", "account_number", "account number", "account_no",
        "patient_account", "acct_no", "acct no", "acct #", "acct#",
        "id", "patient", "member_id", "member id",
        "external patient ref", "external_patient_ref", "external ref",
    ],
    # Reported under "required_missing" when absent; the cell value is resolved
    # through _normalize_device.
    "device_type": [
        "device_type", "device type", "device", "devicetype",
        "product_type", "product type", "product", "item",
        "item_description", "item description", "hcpcs_description",
        "hcpcs description", "description", "product_name",
        "dme", "dme description", "dme_description", "dme desc",
        "equipment", "equipment description",
    ],
    # Reported under "required_missing" when absent; the cell value is parsed
    # by _parse_date.
    "shipment_date": [
        "shipment_date", "shipment date", "ship_date", "ship date",
        "dispense_date", "dispense date", "dispensedate",
        "service_date", "service date",
        "order_date", "order date", "date_of_service", "dos",
        "fill_date", "fill date", "last_ship_date", "last ship date",
    ],
    # Optional: normalize_csv falls back to a quantity of 1 when the column is
    # absent or the value cannot be parsed.
    "quantity": [
        "quantity", "qty", "units", "count", "qty_dispensed",
        "units_dispensed", "quantity_dispensed", "qty_shipped",
    ],
    # Optional: the cell value is classified by _normalize_payer.
    "payer": [
        "payer", "insurance", "insurance_name", "insurance name",
        "plan", "plan_name", "plan name", "payer_name", "payer name",
        "primary_payer", "primary payer", "ins_name", "carrier",
    ],
    # Optional: normalize_csv coerces anything other than
    # sensor/transmitter/pod to "sensor".
    "component": [
        "component", "item_type", "component_type", "type", "supply_type",
    ],
}

# Raw device text -> canonical device identifier, consumed by
# _normalize_device. The incoming value is lower-cased and compared with
# whitespace, "-" and "_" differences ignored, so e.g. "Dexcom-G7" and
# "DEXCOM G7" both resolve through the "dexcom g7" entry. Matching is on the
# whole value, not a substring; normalize_csv skips rows whose device does not
# resolve.
DEVICE_MAP: dict[str, str] = {
    "dexcom g7": "dexcom_g7",
    "dexcom_g7": "dexcom_g7",
    "dexcomg7": "dexcom_g7",
    "g7": "dexcom_g7",
    "dexcom g6": "dexcom_g6",
    "dexcom_g6": "dexcom_g6",
    "dexcomg6": "dexcom_g6",
    "g6": "dexcom_g6",
    "freestyle libre 2": "freestyle_libre_2",
    "freestyle_libre_2": "freestyle_libre_2",
    "freestylelibre2": "freestyle_libre_2",
    "libre 2": "freestyle_libre_2",
    "libre2": "freestyle_libre_2",
    "fsl2": "freestyle_libre_2",
    "fs libre 2": "freestyle_libre_2",
    "freestyle libre 3": "freestyle_libre_3",
    "freestyle_libre_3": "freestyle_libre_3",
    "freestylelibre3": "freestyle_libre_3",
    "libre 3": "freestyle_libre_3",
    "libre3": "freestyle_libre_3",
    "fsl3": "freestyle_libre_3",
    "fs libre 3": "freestyle_libre_3",
    "omnipod 5": "omnipod_5",
    "omnipod_5": "omnipod_5",
    "omnipod5": "omnipod_5",
    # A bare "omnipod" with no model number is treated as Omnipod 5.
    "omnipod": "omnipod_5",
    "op5": "omnipod_5",
}

# Payer phrase -> payer category, consumed by _normalize_payer.
# Lookup is a SUBSTRING test against the normalized payer text, and the first
# entry (in insertion order) that is contained in the text wins. Entry order is
# therefore significant: a more specific phrase must appear before any shorter
# phrase it contains. Payers matching no entry fall back to "commercial".
PAYER_MAP: dict[str, str] = {
    "medicare part b": "medicare",
    "medicare part a": "medicare",
    # Must stay ahead of the plain "medicare" entry so it is not shadowed.
    "medicare advantage": "commercial",
    "medicare": "medicare",
    "cms": "medicare",
    "medicaid": "medicaid",
    "mcd": "medicaid",
    "molina": "medicaid",
    "centene": "medicaid",
    "wellcare": "medicaid",
    "bcbs": "commercial",
    "blue cross": "commercial",
    "blue shield": "commercial",
    "aetna": "commercial",
    "cigna": "commercial",
    "unitedhealthcare": "commercial",
    "united health": "commercial",
    "uhc": "commercial",
    "humana": "commercial",
    "anthem": "commercial",
    "united": "commercial",
}

# strptime patterns tried in order by _parse_date; the first pattern that
# parses the whole value wins, and any time-of-day component is discarded.
# Order matters for ambiguous slash dates: month-first is listed before
# day-first, so "03/04/2024" is read as March 4 and the day-first pattern is
# only reached when the month-first parse fails (e.g. "25/12/2024").
DATE_FORMATS = [
    "%Y-%m-%d",  # 2024-01-31
    "%m/%d/%Y",  # 01/31/2024
    "%m-%d-%Y",  # 01-31-2024
    "%d/%m/%Y",  # 31/01/2024
    "%m/%d/%y",  # 01/31/24
    "%Y%m%d",  # 20240131
    "%d-%b-%Y",  # 31-Jan-2024
    "%b %d, %Y",  # Jan 31, 2024
    "%B %d, %Y",  # January 31, 2024
    "%m/%d/%Y %H:%M:%S",  # 01/31/2024 13:45:00
    "%Y-%m-%dT%H:%M:%S",  # 2024-01-31T13:45:00
]


def _normalize_key(s: str) -> str:
    """Return a canonical comparison key for a header or cell value.

    The text is lower-cased, ``-`` and ``_`` are treated as spaces, a UTF-8
    byte-order mark is dropped, and every run of whitespace (spaces, tabs,
    non-breaking spaces, ...) is collapsed to a single space with none left at
    either end.

    Args:
        s: Raw header or cell text.

    Returns:
        The normalized key; an empty string when *s* holds no visible text.
    """
    # Separators are turned into spaces *before* whitespace is collapsed so
    # inputs such as "ship__date" or "_acct_no_" do not keep doubled or
    # leading/trailing spaces that would defeat alias comparison.
    spaced = s.replace("\ufeff", "").lower().replace("-", " ").replace("_", " ")
    # str.split() with no argument splits on any Unicode whitespace run.
    return " ".join(spaced.split())


def _map_header(raw: str) -> Optional[str]:
    """Return the canonical field whose alias list contains *raw*, else None.

    The incoming header and every alias are compared in their
    ``_normalize_key`` form. Fields are examined in ``HEADER_MAP`` order and
    the first field with a matching alias is returned.
    """
    wanted = _normalize_key(raw)
    return next(
        (
            field
            for field, spellings in HEADER_MAP.items()
            if any(_normalize_key(spelling) == wanted for spelling in spellings)
        ),
        None,
    )


def _map_header_with_confidence(raw: str) -> tuple[Optional[str], str]:
    """Resolve a raw header to ``(canonical_field, confidence)``.

    Confidence is ``"high"`` when the normalized header equals the canonical
    field name itself, and ``"inferred"`` when it only matches one of that
    field's aliases. A header matching nothing yields ``(None, "unmapped")``.
    Fields are examined in ``HEADER_MAP`` order; for each field the exact-name
    test is made before the alias test.
    """
    wanted = _normalize_key(raw)
    for field, spellings in HEADER_MAP.items():
        if _normalize_key(field) == wanted:
            return field, "high"
        if any(_normalize_key(spelling) == wanted for spelling in spellings):
            return field, "inferred"
    return None, "unmapped"


def _parse_date(value: str) -> Optional[date]:
    """Parse *value* with the first matching pattern in ``DATE_FORMATS``.

    Surrounding whitespace is ignored and any time-of-day component is
    dropped. Returns None when no pattern matches.
    """
    cleaned = value.strip()

    def _attempt(pattern: str) -> Optional[datetime]:
        # strptime signals "does not match this pattern" with ValueError.
        try:
            return datetime.strptime(cleaned, pattern)
        except ValueError:
            return None

    for parsed in map(_attempt, DATE_FORMATS):
        if parsed is not None:
            return parsed.date()
    return None


def _normalize_device(value: str) -> Optional[str]:
    """Map a raw device value to its canonical identifier.

    The comparison ignores case, whitespace, ``-`` and ``_`` on BOTH the
    incoming value and the ``DEVICE_MAP`` aliases, so an alias spelled with
    underscores or hyphens is reachable just like its spaced spelling. The
    whole value must match an alias; substrings are not considered.

    Args:
        value: Raw cell text from the device column.

    Returns:
        The canonical device identifier, or None when the value is empty or
        matches no alias.
    """
    wanted = re.sub(r"\s+", "", _normalize_key(value))
    if not wanted:
        return None
    for alias, canonical in DEVICE_MAP.items():
        # Aliases go through the same normalization as the input; comparing a
        # normalized value against a raw alias would make any alias containing
        # "_" or "-" impossible to match.
        if re.sub(r"\s+", "", _normalize_key(alias)) == wanted:
            return canonical
    return None


def _normalize_payer(value: str) -> str:
    """Classify a raw payer name into a payer category.

    The normalized payer text is searched for each ``PAYER_MAP`` phrase as a
    substring; the category of the first phrase found is returned. Text that
    contains none of the phrases (including empty text) is classified as
    ``"commercial"``.
    """
    haystack = _normalize_key(value)
    # PAYER_MAP is scanned in insertion order, which is what lets a specific
    # phrase such as "medicare advantage" win over the shorter "medicare".
    categories = (
        category for phrase, category in PAYER_MAP.items() if phrase in haystack
    )
    return next(categories, "commercial")


def _parse_quantity(raw: str) -> int:
    """Return *raw* as a whole quantity of at least 1, defaulting to 1."""
    try:
        return max(1, int(float(raw)))
    except (ValueError, TypeError, OverflowError):
        # ValueError covers blanks, non-numeric text and NaN; OverflowError is
        # raised by int() for values such as "inf" or "1e999". One bad cell
        # must not abort the whole import.
        return 1


def normalize_csv(text: str) -> tuple[list[ShipmentRecord], list[str], dict]:
    """
    Parse raw CSV text and return (records, skipped_reasons, mapping_summary).
    Tolerates header drift and normalizes device/payer/date values.

    Column selection: each canonical field is fed by exactly one source
    column. A header that equals the canonical field name (confidence "high")
    is never displaced by an alias column (confidence "inferred"); among
    columns of equal confidence the right-most one is used. Columns that lose
    this selection are ignored and are not listed as unmapped.

    Row handling: rows are numbered from 2 (the header being row 1). A row is
    skipped, with a reason appended to skipped_reasons, when patient_id is
    empty, the device is not recognized, or the date cannot be parsed.
    Quantity falls back to 1, payer to "commercial", and component to
    "sensor" when missing or unrecognized.

    mapping_summary format:
        {
          "mapped": {canonical_field: {"raw_header": str, "confidence": "high"|"inferred"}},
          "unmapped_columns": [str],
          "required_missing": [str],
        }
    When the text contains no header row, mapping_summary is an empty dict
    and skipped_reasons holds a single "No headers found in file" entry.
    """
    # Files saved as "UTF-8 with BOM" and decoded as plain UTF-8 keep a
    # leading U+FEFF, which str.strip() does not remove and which would
    # otherwise become part of the first header name.
    reader = csv.DictReader(io.StringIO(text.lstrip("\ufeff").strip()))
    if not reader.fieldnames:
        return [], ["No headers found in file"], {}

    mapping_detail: dict[str, dict] = {}
    unmapped_columns: list[str] = []

    for raw_header in reader.fieldnames:
        canonical, confidence = _map_header_with_confidence(raw_header)
        if not canonical:
            unmapped_columns.append(raw_header)
            continue
        claimed = mapping_detail.get(canonical)
        if claimed is not None and claimed["confidence"] == "high" and confidence != "high":
            # e.g. "shipment_date" followed by "order_date": keep the exact
            # header rather than letting the later alias override it.
            continue
        mapping_detail[canonical] = {"raw_header": raw_header, "confidence": confidence}

    # A tuple (not a set) keeps required_missing in a stable, predictable order.
    required_fields = ("patient_id", "device_type", "shipment_date")
    required_missing = [f for f in required_fields if f not in mapping_detail]

    mapping_summary = {
        "mapped": mapping_detail,
        "unmapped_columns": unmapped_columns,
        "required_missing": required_missing,
    }

    records: list[ShipmentRecord] = []
    skipped: list[str] = []

    for i, row in enumerate(reader, start=2):
        # Short rows yield None for missing cells; treat those as empty text.
        mapped: dict[str, str] = {
            canonical: (row.get(detail["raw_header"]) or "").strip()
            for canonical, detail in mapping_detail.items()
        }

        patient_id = mapped.get("patient_id", "")
        if not patient_id:
            skipped.append(f"Row {i}: missing patient_id")
            continue

        raw_device = mapped.get("device_type", "")
        device_type = _normalize_device(raw_device)
        if not device_type:
            skipped.append(f"Row {i} ({patient_id}): unrecognized device '{raw_device}'")
            continue

        raw_date = mapped.get("shipment_date", "")
        shipment_date = _parse_date(raw_date)
        if not shipment_date:
            skipped.append(f"Row {i} ({patient_id}): unparseable date '{raw_date}'")
            continue

        quantity = _parse_quantity(mapped.get("quantity", "1"))

        payer = _normalize_payer(mapped.get("payer", ""))
        component = (mapped.get("component", "sensor") or "sensor").lower().strip()
        if component not in ("sensor", "transmitter", "pod"):
            component = "sensor"

        records.append(ShipmentRecord(
            patient_id=patient_id,
            device_type=device_type,
            shipment_date=shipment_date,
            quantity=quantity,
            payer=payer,
            component=component,
        ))

    return records, skipped, mapping_summary