# Signal/test-data/generate_samples.py

"""
Generate 25 CSV test files with VARIED headers simulating messy supplier exports.

Each file uses a different combination of column names, date formats, column order,
and payer strings — matching what real DME billing system exports look like.
The normalizer (normalizer.py) should successfully process all 25 files.
"""

import csv
import random
import os
from datetime import date, timedelta

# Seed the module-level RNG so the random picks made below are repeatable.
random.seed(42)

# Canonical device identifiers.
DEVICE_TYPES = [
    "dexcom_g7",
    "dexcom_g6",
    "freestyle_libre_3",
    "omnipod_5",
]

# Component associated with each canonical device identifier.
COMPONENTS = {
    "dexcom_g7": "sensor",
    "dexcom_g6": "sensor",
    "freestyle_libre_3": "sensor",
    "omnipod_5": "pod",
}

# Display form of each component name.
COMPONENT_DISPLAY = {
    "sensor": "Sensor",
    "pod": "Pod",
}

# All date buckets are expressed relative to the day the script runs.
TODAY = date.today()

# Bucket name -> (earliest, latest) date, written as "days before TODAY".
DATE_BUCKETS = {
    label: (TODAY - timedelta(days=oldest), TODAY - timedelta(days=newest))
    for label, oldest, newest in (
        ("ok", 10, 1),
        ("visit_due", 400, 250),
        ("out_of_coverage", 600, 500),
        ("refill_window", 30, 25),
    )
}

# Directory containing this script.
OUTPUT_DIR = os.path.split(os.path.abspath(__file__))[0]

# --- Header variation configs ---
# Each entry: (patient_id_col, device_col, date_col, qty_col, payer_col, component_col, date_fmt, extras)
#
# - The first six items are the header names written to the CSV, in that order.
# - date_fmt is the strftime pattern used to render each shipment date. Shipment
#   dates are `date` objects, so patterns containing %H/%M/%S render as 00:00:00.
# - extras maps additional column names to a constant value repeated on every row.
#
# The tuple only controls header names, date format and extra columns. The payer
# value is drawn at random when the file is generated, so the per-entry notes
# below describe only the headers and date format.

HEADER_VARIANTS = [
    # 1 — canonical
    ("patient_id", "device_type", "shipment_date", "quantity", "payer", "component",
     "%Y-%m-%d", {}),
    # 2 — Brightree-style
    ("Patient ID", "Item Description", "Service Date", "Qty", "Insurance Name", "Item Type",
     "%m/%d/%Y", {"Prescriber NPI": "1234567890", "Branch": "PA-001"}),
    # 3 — all caps, short
    ("PT_ID", "DEVICE", "SHIP DATE", "UNITS", "CARRIER", "TYPE",
     "%Y-%m-%d", {}),
    # 4 — MRN style + text date
    ("MRN", "Product Name", "Dispense Date", "Qty Dispensed", "Plan Name", "Supply Type",
     "%d-%b-%Y", {"Supplier": "Gaboro DME", "State": "PA"}),
    # 5 — account number + two-digit-year date
    ("Account Number", "Product", "Fill Date", "Count", "Primary Payer", "component",
     "%m/%d/%y", {}),
    # 6 — external ref + ISO datetime
    ("External Patient Ref", "Item", "Date of Service", "Qty Shipped", "Insurance", "item_type",
     "%Y-%m-%dT%H:%M:%S", {"Notes": "batch export"}),
    # 7 — Acct # abbreviation + YYYYMMDD
    ("Acct #", "DME", "Order Date", "quantity", "plan", "component_type",
     "%Y%m%d", {}),
    # 8 — patient + device type alternate
    ("patient", "device type", "last ship date", "units dispensed", "payer name", "type",
     "%m/%d/%Y", {"Supplier Branch": "NY-003"}),
    # 9 — pt_id + product_type + dos abbreviation
    ("pt_id", "product_type", "dos", "qty", "ins_name", "component",
     "%Y-%m-%d", {}),
    # 10 — account_no + hcpcs description
    ("account_no", "hcpcs_description", "service_date", "units", "primary_payer", "supply_type",
     "%m-%d-%Y", {"HCPCS Code": "A9277"}),
    # 11 — patient_account + generic description column
    ("patient_account", "description", "ship_date", "quantity_dispensed", "carrier", "component",
     "%Y-%m-%d", {"Account Manager": "J. Smith"}),
    # 12 — id + product + fill_date
    ("id", "product", "fill_date", "qty_shipped", "payer", "item_type",
     "%m/%d/%Y", {}),
    # 13 — PT ID with space + Title Case headers
    ("PT ID", "Device Type", "Shipment Date", "Quantity", "Insurance", "Component",
     "%Y-%m-%d", {"Region": "Southeast"}),
    # 14 — patientid (no space) + dispense date
    ("patientid", "devicetype", "dispensedate", "qty", "payername", "supplytype",
     "%m/%d/%Y", {}),
    # 15 — account_number + day-first date
    ("account_number", "item_description", "order_date", "units_dispensed", "plan_name", "component",
     "%d/%m/%Y", {"Facility": "Gaboro PA Main"}),
    # 16 — MRN + "Product Type" / "Component Type" headers
    ("MRN", "Product Type", "Service Date", "Qty", "Insurance Name", "Component Type",
     "%Y-%m-%d", {}),
    # 17 — mixed case + date with time-of-day
    ("Patient_ID", "Device", "Ship_Date", "Units", "Plan", "Type",
     "%m/%d/%Y %H:%M:%S", {"Export Type": "CGM Only"}),
    # 18 — patient id (space) + extra cols
    ("patient id", "item", "dispense date", "count", "carrier", "supply_type",
     "%Y-%m-%d", {"Billing Staff": "M. Jones", "Auth Number": "CGM-2026-001"}),
    # 19 — Acct No + abbreviated-month text date
    ("Acct No", "Product Name", "Last Ship Date", "Qty Dispensed", "Primary Payer", "Component",
     "%b %d, %Y", {}),
    # 20 — MEMBER ID style
    ("Member ID", "DME Description", "DOS", "QTY", "Insurance", "Item Type",
     "%Y-%m-%d", {"Payer ID": "00019"}),
    # 21 — pt id (space) + otherwise canonical headers
    ("pt id", "device_type", "service date", "quantity", "payer", "component",
     "%m/%d/%Y", {}),
    # 22 — acct_no + lowercase headers with spaces
    ("acct_no", "product", "fill date", "units", "insurance name", "type",
     "%Y-%m-%d", {"Branch Code": "GA-02"}),
    # 23 — External Ref + timestamp
    ("External Patient Ref", "Item Description", "Dispense Date", "Quantity", "Plan Name", "Supply Type",
     "%m/%d/%Y %H:%M:%S", {}),
    # 24 — patient_id canonical + extra noise columns
    ("patient_id", "device_type", "shipment_date", "quantity", "payer", "component",
     "%Y-%m-%d", {"Internal Code": "DME-99", "Region": "Northeast", "Staff ID": "STAFF-001"}),
    # 25 — Acct # + full-month-name date (%B %d, %Y)
    ("Acct #", "Device", "Order Date", "Qty", "Insurance", "Component",
     "%B %d, %Y", {"Supplier Code": "STTIL-01"}),
]

# Payer category -> the spellings of that payer that may appear in an export.
PAYER_STRINGS = {
    "medicare": [
        "Medicare Part B",
        "Medicare",
        "CMS",
        "Medicare Part A",
        "Medicare Part B - CGM",
    ],
    "medicaid": [
        "Medicaid - GA",
        "Medicaid - PA",
        "Molina Healthcare",
        "WellCare",
        "Centene",
        "Georgia Medicaid",
        "Medicaid",
    ],
    "commercial": [
        "BCBS - FL",
        "Blue Cross Blue Shield",
        "Aetna",
        "UnitedHealth",
        "UHC",
        "Cigna",
        "Humana",
        "Anthem",
        "United Healthcare",
        "Aetna Commercial",
    ],
}

# Canonical device identifier -> the free-text spellings written to the device column.
DEVICE_DISPLAY = {
    "dexcom_g7": [
        "Dexcom G7",
        "G7",
        "Dexcom G7 CGM",
        "dexcom g7",
    ],
    "dexcom_g6": [
        "Dexcom G6",
        "G6",
        "Dexcom G6 Pro",
        "dexcom g6",
    ],
    "freestyle_libre_3": [
        "FreeStyle Libre 3",
        "Libre 3",
        "FSL3",
        "fs libre 3",
        "FreestyleLibre3",
    ],
    "omnipod_5": [
        "Omnipod 5",
        "Omnipod",
        "OmniPod 5",
        "op5",
    ],
}

# One weighted-random flag per header variant; the flag selects the date bucket
# (and appears in the filename) of the corresponding generated file.
# k is derived from HEADER_VARIANTS so that adding or removing a variant cannot
# leave the generator loop indexing past the end of this list.
flags_assigned = random.choices(
    ["out_of_coverage", "visit_due", "refill_window", "ok"],
    weights=[30, 25, 25, 20],
    k=len(HEADER_VARIANTS),
)


def random_date(bucket):
    """Pick a random date inside *bucket*, an inclusive ``(start, end)`` pair.

    If ``end`` is earlier than ``start`` the span is treated as zero days,
    so ``start`` itself is returned.
    """
    first_day, last_day = bucket
    span_days = (last_day - first_day).days
    if span_days < 0:
        span_days = 0
    offset = random.randint(0, span_days)
    return first_day + timedelta(days=offset)


def format_date(d, fmt):
    """Render *d* as text using the ``strftime`` pattern *fmt*."""
    rendered = d.strftime(fmt)
    return rendered


def random_payer_string(device):
    """Return a randomly chosen payer spelling from ``PAYER_STRINGS``.

    A payer category is drawn first (medicare 50, medicaid 20, commercial 30,
    as relative weights), then one spelling is picked uniformly from that
    category's list.

    *device* is accepted for the caller's convenience but does not influence
    the result.
    """
    categories = ["medicare", "medicaid", "commercial"]
    (picked_category,) = random.choices(categories, weights=[50, 20, 30])
    spellings = PAYER_STRINGS[picked_category]
    return random.choice(spellings)


# Write one CSV file per header variant.
# Each file gets a single flag (date bucket), device, component and payer string;
# only the patient id, device spelling, shipment date and quantity vary per row.
for i, variant in enumerate(HEADER_VARIANTS, start=1):
    pid_col, dev_col, date_col, qty_col, payer_col, comp_col, date_fmt, extras = variant

    flag_key = flags_assigned[i - 1]
    bucket = DATE_BUCKETS[flag_key]

    device = random.choice(DEVICE_TYPES)
    component = COMPONENTS[device]
    payer_str = random_payer_string(device)
    num_rows = random.randint(3, 8)

    filename = f"sample-batch-{i:02d}-{flag_key}.csv"
    filepath = os.path.join(OUTPUT_DIR, filename)

    # Core columns first, then any variant-specific extra columns.
    fieldnames = [pid_col, dev_col, date_col, qty_col, payer_col, comp_col, *extras]

    # newline="" lets the csv module control line endings; the explicit encoding
    # keeps the output independent of the platform's default locale.
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for j in range(num_rows):
            # Ten ids are reserved per file, so ids never collide across files.
            pid = f"PT-{1001 + (i - 1) * 10 + j}"
            # Bucket date plus up to three days of jitter in either direction.
            ship = random_date(bucket) + timedelta(days=random.randint(-3, 3))
            # The jitter can overshoot the most recent bucket; a shipment dated
            # in the future is never valid, so cap at today.
            ship = min(ship, TODAY)
            row = {
                pid_col:   pid,
                dev_col:   random.choice(DEVICE_DISPLAY[device]),
                date_col:  format_date(ship, date_fmt),
                qty_col:   random.choice([1, 2, 3, 6, 9]),
                payer_col: payer_str,
                comp_col:  component,
            }
            # Extra columns carry the same constant value on every row.
            row.update(extras)
            writer.writerow(row)

    print(f"Wrote {filename} ({num_rows} rows, flag={flag_key}, headers: {pid_col}|{dev_col}|{date_col}|{payer_col})")

print(f"\nDone — {len(HEADER_VARIANTS)} files in {OUTPUT_DIR}")