# Signal/python-backend/api/normalizer.py

"""
CSV header normalization for Signal.

Maps messy supplier CSV exports to canonical ShipmentRecord fields.
Tolerates header drift, alternative column names, and common date formats.
"""

import csv
import io
import re
from datetime import date, datetime
from typing import Optional

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from core.coverage_calculator import ShipmentRecord

# Canonical record field name -> raw CSV header spellings accepted for it.
# The keys are the keyword arguments normalize_csv passes to ShipmentRecord.
# _map_header compares an incoming header to these aliases after
# _normalize_key (lower-cased, with "-" and "_" treated as spaces) and returns
# the first canonical field, in the order listed here, whose alias list
# contains the header. Headers that match nothing are ignored.
HEADER_MAP: dict[str, list[str]] = {
    "patient_id": [
        "patient_id", "patientid", "patient id", "pt_id", "pt id",
        "mrn", "account_number", "account number", "account_no",
        "patient_account", "acct_no", "id", "patient",
    ],
    "device_type": [
        "device_type", "device type", "device", "devicetype",
        "product_type", "product type", "product", "item",
        "item_description", "item description", "hcpcs_description",
        "description", "product_name",
    ],
    "shipment_date": [
        "shipment_date", "shipment date", "ship_date", "ship date",
        "dispense_date", "dispense date", "service_date", "service date",
        "order_date", "order date", "date_of_service", "dos",
        "fill_date", "fill date", "last_ship_date", "last ship date",
    ],
    "quantity": [
        "quantity", "qty", "units", "count", "qty_dispensed",
        "units_dispensed", "quantity_dispensed", "qty_shipped",
    ],
    "payer": [
        "payer", "insurance", "insurance_name", "insurance name",
        "plan", "plan_name", "plan name", "payer_name", "payer name",
        "primary_payer", "primary payer", "ins_name", "carrier",
    ],
    "component": [
        "component", "item_type", "component_type", "type", "supply_type",
    ],
}

# Device spelling -> canonical device identifier, consulted by
# _normalize_device. A cell matches an entry when its normalized text equals
# the spelling exactly, or when both are equal once all whitespace is removed.
# Values that match no entry make normalize_csv skip the row.
#
# NOTE: the lookup key has already been through _normalize_key, so it never
# contains "_" or "-". The underscore spellings below therefore cannot match
# on their own; inputs such as "dexcom_g7" resolve through the
# space-separated entry ("dexcom g7") instead.
DEVICE_MAP: dict[str, str] = {
    "dexcom g7": "dexcom_g7",
    "dexcom_g7": "dexcom_g7",
    "dexcomg7": "dexcom_g7",
    "g7": "dexcom_g7",
    "dexcom g6": "dexcom_g6",
    "dexcom_g6": "dexcom_g6",
    "dexcomg6": "dexcom_g6",
    "g6": "dexcom_g6",
    "freestyle libre 2": "freestyle_libre_2",
    "freestyle_libre_2": "freestyle_libre_2",
    "freestylelibre2": "freestyle_libre_2",
    "libre 2": "freestyle_libre_2",
    "libre2": "freestyle_libre_2",
    "fsl2": "freestyle_libre_2",
    "fs libre 2": "freestyle_libre_2",
    "freestyle libre 3": "freestyle_libre_3",
    "freestyle_libre_3": "freestyle_libre_3",
    "freestylelibre3": "freestyle_libre_3",
    "libre 3": "freestyle_libre_3",
    "libre3": "freestyle_libre_3",
    "fsl3": "freestyle_libre_3",
    "fs libre 3": "freestyle_libre_3",
    "omnipod 5": "omnipod_5",
    "omnipod_5": "omnipod_5",
    "omnipod5": "omnipod_5",
    "omnipod": "omnipod_5",
    "op5": "omnipod_5",
}

# Payer-name fragment -> payer category ("medicare", "medicaid", "commercial").
# _normalize_payer does substring matching and returns the category of the
# FIRST fragment, in insertion order, that occurs in the normalized payer
# text. Entry order is therefore significant: "medicare advantage" must stay
# ahead of the bare "medicare" entry to be classified as "commercial".
# Payer text that contains none of these fragments falls back to "commercial".
PAYER_MAP: dict[str, str] = {
    "medicare part b": "medicare",
    "medicare part a": "medicare",
    "medicare advantage": "commercial",
    "medicare": "medicare",
    "cms": "medicare",
    "medicaid": "medicaid",
    "mcd": "medicaid",
    "molina": "medicaid",
    "centene": "medicaid",
    "wellcare": "medicaid",
    "bcbs": "commercial",
    "blue cross": "commercial",
    "blue shield": "commercial",
    "aetna": "commercial",
    "cigna": "commercial",
    "unitedhealthcare": "commercial",
    "united health": "commercial",
    "uhc": "commercial",
    "humana": "commercial",
    "anthem": "commercial",
    "united": "commercial",
}

# strptime patterns tried in this order by _parse_date; the first one that
# parses the whole value wins. Order matters for ambiguous input: month-first
# "%m/%d/%Y" precedes day-first "%d/%m/%Y", so "03/04/2024" is read as
# March 4, and the day-first pattern only applies when month-first fails
# (for example when the first number is greater than 12). The last two
# patterns accept timestamps; _parse_date keeps only their date part.
DATE_FORMATS: list[str] = [
    "%Y-%m-%d",
    "%m/%d/%Y",
    "%m-%d-%Y",
    "%d/%m/%Y",
    "%m/%d/%y",
    "%Y%m%d",
    "%d-%b-%Y",
    "%b %d, %Y",
    "%B %d, %Y",
    "%m/%d/%Y %H:%M:%S",
    "%Y-%m-%dT%H:%M:%S",
]


def _normalize_key(s: str) -> str:
    return s.strip().lower().replace("-", " ").replace("_", " ")


def _map_header(raw: str) -> Optional[str]:
    """Return the canonical field name for a raw CSV header, or None.

    Both the header and every alias in HEADER_MAP go through _normalize_key
    before comparison. Canonical fields are checked in HEADER_MAP order and
    the first one with a matching alias is returned.
    """
    wanted = _normalize_key(raw)
    for field_name, spellings in HEADER_MAP.items():
        if any(_normalize_key(spelling) == wanted for spelling in spellings):
            return field_name
    return None


def _parse_date(value: str) -> Optional[date]:
    """Parse a date cell using the patterns in DATE_FORMATS.

    Surrounding whitespace is ignored. Patterns are attempted in list order
    and the first successful parse is returned as a ``date`` (any time of day
    is discarded). Returns None when no pattern fits, including for an empty
    value.
    """
    cleaned = value.strip()
    for pattern in DATE_FORMATS:
        try:
            parsed = datetime.strptime(cleaned, pattern)
        except ValueError:
            # This pattern does not fit; move on to the next candidate.
            continue
        return parsed.date()
    return None


def _normalize_device(value: str) -> Optional[str]:
    """Map a free-text device cell to its canonical identifier, or None.

    The value is normalized with _normalize_key and compared to each
    DEVICE_MAP spelling twice: as-is, and with all whitespace removed from
    both sides (so "Dexcom G7" and "DexcomG7" are equivalent). The first
    matching entry in DEVICE_MAP order wins.
    """
    spaced = _normalize_key(value)
    squeezed = re.sub(r"\s+", "", spaced)
    candidates = (
        device
        for spelling, device in DEVICE_MAP.items()
        if spaced == spelling or squeezed == re.sub(r"\s+", "", spelling)
    )
    return next(candidates, None)


def _normalize_payer(value: str) -> str:
    """Classify a free-text payer name into a payer category.

    Returns the category of the first PAYER_MAP fragment that occurs anywhere
    in the normalized payer text, or "commercial" when none does (including
    for an empty value).
    """
    haystack = _normalize_key(value)
    # Fragments are tested in PAYER_MAP insertion order, not by length, so the
    # table itself must list more specific fragments (e.g. "medicare
    # advantage") ahead of the generic ones they contain ("medicare").
    hits = (
        category
        for fragment, category in PAYER_MAP.items()
        if fragment in haystack
    )
    return next(hits, "commercial")


def normalize_csv(text: str) -> tuple[list[ShipmentRecord], list[str]]:
    """
    Parse raw CSV text and return (records, skipped_reasons).

    Tolerates header drift and normalizes device/payer/date values.

    Args:
        text: The full CSV document with the header row first. Surrounding
            whitespace and a leading UTF-8 byte-order mark are ignored.

    Returns:
        A ``(records, skipped)`` tuple. ``records`` holds one ShipmentRecord
        per usable row, in file order. ``skipped`` holds one human-readable
        reason per rejected row (missing patient id, unrecognized device, or
        unparseable date), or a single message when there is no header row.
        Row numbers in the messages count parsed rows starting at 2, the
        header being row 1.

    Row handling:
        * Columns whose header maps to no canonical field are ignored.
        * If several columns map to the same canonical field, the last
          non-empty cell wins; blank cells never replace a value.
        * Quantity falls back to 1 when the column is absent, blank,
          non-numeric or non-finite, and is never lower than 1.
        * Component falls back to "sensor" unless it is "sensor",
          "transmitter" or "pod".
    """
    # Spreadsheet exports frequently begin with a byte-order mark. str.strip()
    # does not treat U+FEFF as whitespace, so drop it explicitly; otherwise it
    # sticks to the first header name and that column is never recognized.
    reader = csv.DictReader(io.StringIO(text.lstrip("\ufeff").strip()))
    if not reader.fieldnames:
        return [], ["No headers found in file"]

    # Raw header text -> canonical field, for recognized columns only.
    column_map: dict[str, str] = {}
    for raw_header in reader.fieldnames:
        canonical = _map_header(raw_header)
        if canonical:
            column_map[raw_header] = canonical

    records: list[ShipmentRecord] = []
    skipped: list[str] = []

    for i, row in enumerate(reader, start=2):
        mapped: dict[str, str] = {}
        for raw_h, canonical in column_map.items():
            # Short rows yield None for the missing cells.
            cell = (row.get(raw_h) or "").strip()
            # Several raw columns can share one canonical field (e.g. both a
            # ship date and an order date). A blank cell must not wipe out a
            # value already taken from another column.
            if cell or canonical not in mapped:
                mapped[canonical] = cell

        patient_id = mapped.get("patient_id", "")
        if not patient_id:
            skipped.append(f"Row {i}: missing patient_id")
            continue

        raw_device = mapped.get("device_type", "")
        device_type = _normalize_device(raw_device)
        if not device_type:
            skipped.append(f"Row {i} ({patient_id}): unrecognized device '{raw_device}'")
            continue

        raw_date = mapped.get("shipment_date", "")
        shipment_date = _parse_date(raw_date)
        if not shipment_date:
            skipped.append(f"Row {i} ({patient_id}): unparseable date '{raw_date}'")
            continue

        raw_qty = mapped.get("quantity", "1")
        try:
            # Going through float accepts values such as "3.0".
            quantity = max(1, int(float(raw_qty)))
        except (ValueError, TypeError, OverflowError):
            # ValueError: blank, non-numeric or NaN. OverflowError: "inf" or
            # a value too large for float (e.g. "1e999"), which int() rejects.
            quantity = 1

        payer = _normalize_payer(mapped.get("payer", ""))
        component = (mapped.get("component", "sensor") or "sensor").lower().strip()
        if component not in ("sensor", "transmitter", "pod"):
            component = "sensor"

        records.append(ShipmentRecord(
            patient_id=patient_id,
            device_type=device_type,
            shipment_date=shipment_date,
            quantity=quantity,
            payer=payer,
            component=component,
        ))

    return records, skipped