- lock CORS to Vercel domain via ALLOWED_ORIGINS env var (removes allow_origins=*) - add X-API-Key header auth on /api/upload and /api/export - normalizer: add mapping confidence (high/inferred), new aliases for Acct #, Member ID, External Patient Ref, DME Description, dispensedate; 63/63 CSV files pass - coverage_calculator: add RULE_VERSION = "v0.1", rule_version on every CoverageResult - main.py: audit logging wired on upload + export, rule_version + mapping_summary in response - generate_samples.py: 25 CSV files now use 25 different real-world header formats - add generate_10k.py for 10,000-patient synthetic dataset - add tests/smoke_test.py (passes against local backend) - add docs/pilot-guide-v1.md for Robert Robinson pilot onboarding - add docs/daniel-pilot-readiness-whitepaper.md and .pdf Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
195 lines
8.1 KiB
Python
195 lines
8.1 KiB
Python
"""
Generate 25 CSV test files with VARIED headers simulating messy supplier exports.

Each file uses a different combination of column names, date formats, extra columns,
and payer strings — matching what real DME billing system exports look like.
The normalizer (normalizer.py) should successfully process all 25 files.
"""
|
|
|
|
import csv
|
|
import random
|
|
import os
|
|
from datetime import date, timedelta
|
|
|
|
# Fixed seed: the draws made through the `random` module repeat from run to run.
random.seed(42)

# Component shipped for each supported device. Insertion order matters because
# DEVICE_TYPES is derived from the keys and is later sampled with random.choice.
COMPONENTS = {
    "dexcom_g7": "sensor",
    "dexcom_g6": "sensor",
    "freestyle_libre_3": "sensor",
    "omnipod_5": "pod",
}

# Canonical device identifiers, in the same order as the COMPONENTS keys.
DEVICE_TYPES = list(COMPONENTS)

# Title-case label for each component value.
COMPONENT_DISPLAY = dict(sensor="Sensor", pod="Pod")
|
|
|
|
# Reference day; every bucket below is expressed relative to it.
TODAY = date.today()

# Age window in days for each flag, as (oldest, newest) days before TODAY.
_BUCKET_AGES_IN_DAYS = {
    "ok": (10, 1),
    "visit_due": (400, 250),
    "out_of_coverage": (600, 500),
    "refill_window": (30, 25),
}

# flag -> (earliest date, latest date) from which shipment dates are drawn.
DATE_BUCKETS = {
    flag: (TODAY - timedelta(days=oldest), TODAY - timedelta(days=newest))
    for flag, (oldest, newest) in _BUCKET_AGES_IN_DAYS.items()
}

# The generated CSV files are written next to this script.
_SCRIPT_PATH = os.path.abspath(__file__)
OUTPUT_DIR = os.path.dirname(_SCRIPT_PATH)
|
|
|
|
# --- Header variation configs ---
# One tuple per generated file, always in this field order:
#   (patient_id_col, device_col, date_col, qty_col, payer_col, component_col,
#    date_fmt, extras)
# `date_fmt` is the strftime pattern used for the date column; `extras` maps
# additional column names to the constant value written on every row.

# Date patterns shared by several variants.
_ISO_DATE = "%Y-%m-%d"
_US_DATE = "%m/%d/%Y"
_US_DATETIME = "%m/%d/%Y %H:%M:%S"

HEADER_VARIANTS = [
    # 1: canonical snake_case headers, ISO date
    ("patient_id", "device_type", "shipment_date", "quantity", "payer", "component",
     _ISO_DATE, {}),
    # 2: title-case headers (original label: Brightree-style), two extra columns
    ("Patient ID", "Item Description", "Service Date", "Qty", "Insurance Name", "Item Type",
     _US_DATE, {"Prescriber NPI": "1234567890", "Branch": "PA-001"}),
    # 3: short upper-case headers
    ("PT_ID", "DEVICE", "SHIP DATE", "UNITS", "CARRIER", "TYPE",
     _ISO_DATE, {}),
    # 4: MRN identifier, day-abbreviated-month-year text date
    ("MRN", "Product Name", "Dispense Date", "Qty Dispensed", "Plan Name", "Supply Type",
     "%d-%b-%Y", {"Supplier": "Gaboro DME", "State": "PA"}),
    # 5: account number identifier, two-digit year
    ("Account Number", "Product", "Fill Date", "Count", "Primary Payer", "component",
     "%m/%d/%y", {}),
    # 6: external patient reference, ISO datetime
    ("External Patient Ref", "Item", "Date of Service", "Qty Shipped", "Insurance", "item_type",
     "%Y-%m-%dT%H:%M:%S", {"Notes": "batch export"}),
    # 7: "Acct #" abbreviation, compact YYYYMMDD date
    ("Acct #", "DME", "Order Date", "quantity", "plan", "component_type",
     "%Y%m%d", {}),
    # 8: lower-case headers containing spaces
    ("patient", "device type", "last ship date", "units dispensed", "payer name", "type",
     _US_DATE, {"Supplier Branch": "NY-003"}),
    # 9: terse snake_case abbreviations (pt_id, dos, ins_name)
    ("pt_id", "product_type", "dos", "qty", "ins_name", "component",
     _ISO_DATE, {}),
    # 10: account_no with an HCPCS description column, dash-separated US date
    ("account_no", "hcpcs_description", "service_date", "units", "primary_payer", "supply_type",
     "%m-%d-%Y", {"HCPCS Code": "A9277"}),
    # 11: patient_account identifier
    ("patient_account", "description", "ship_date", "quantity_dispensed", "carrier", "component",
     _ISO_DATE, {"Account Manager": "J. Smith"}),
    # 12: bare "id" identifier
    ("id", "product", "fill_date", "qty_shipped", "payer", "item_type",
     _US_DATE, {}),
    # 13: "PT ID" with a space, title-case headers
    ("PT ID", "Device Type", "Shipment Date", "Quantity", "Insurance", "Component",
     _ISO_DATE, {"Region": "Southeast"}),
    # 14: run-together names without separators (patientid, dispensedate)
    ("patientid", "devicetype", "dispensedate", "qty", "payername", "supplytype",
     _US_DATE, {}),
    # 15: account_number identifier, day-first slash date
    ("account_number", "item_description", "order_date", "units_dispensed", "plan_name", "component",
     "%d/%m/%Y", {"Facility": "Gaboro PA Main"}),
    # 16: MRN identifier with title-case "Component Type"
    ("MRN", "Product Type", "Service Date", "Qty", "Insurance Name", "Component Type",
     _ISO_DATE, {}),
    # 17: mixed-case headers with underscores, US date plus time of day
    ("Patient_ID", "Device", "Ship_Date", "Units", "Plan", "Type",
     _US_DATETIME, {"Export Type": "CGM Only"}),
    # 18: "patient id" with a space, two extra columns
    ("patient id", "item", "dispense date", "count", "carrier", "supply_type",
     _ISO_DATE, {"Billing Staff": "M. Jones", "Auth Number": "CGM-2026-001"}),
    # 19: "Acct No" identifier, abbreviated-month text date
    ("Acct No", "Product Name", "Last Ship Date", "Qty Dispensed", "Primary Payer", "Component",
     "%b %d, %Y", {}),
    # 20: "Member ID" identifier
    ("Member ID", "DME Description", "DOS", "QTY", "Insurance", "Item Type",
     _ISO_DATE, {"Payer ID": "00019"}),
    # 21: "pt id" with a space, otherwise canonical names
    ("pt id", "device_type", "service date", "quantity", "payer", "component",
     _US_DATE, {}),
    # 22: acct_no identifier
    ("acct_no", "product", "fill date", "units", "insurance name", "type",
     _ISO_DATE, {"Branch Code": "GA-02"}),
    # 23: external patient reference, US date plus time of day
    ("External Patient Ref", "Item Description", "Dispense Date", "Quantity", "Plan Name", "Supply Type",
     _US_DATETIME, {}),
    # 24: canonical headers followed by three unrelated extra columns
    ("patient_id", "device_type", "shipment_date", "quantity", "payer", "component",
     _ISO_DATE, {"Internal Code": "DME-99", "Region": "Northeast", "Staff ID": "STAFF-001"}),
    # 25: "Acct #" identifier, full-month-name text date (%B %d, %Y)
    ("Acct #", "Device", "Order Date", "Qty", "Insurance", "Component",
     "%B %d, %Y", {"Supplier Code": "STTIL-01"}),
]
|
|
|
|
# Payer category -> the spellings written to the payer column for that category.
PAYER_STRINGS = dict(
    medicare=[
        "Medicare Part B",
        "Medicare",
        "CMS",
        "Medicare Part A",
        "Medicare Part B - CGM",
    ],
    medicaid=[
        "Medicaid - GA",
        "Medicaid - PA",
        "Molina Healthcare",
        "WellCare",
        "Centene",
        "Georgia Medicaid",
        "Medicaid",
    ],
    commercial=[
        "BCBS - FL",
        "Blue Cross Blue Shield",
        "Aetna",
        "UnitedHealth",
        "UHC",
        "Cigna",
        "Humana",
        "Anthem",
        "United Healthcare",
        "Aetna Commercial",
    ],
)
|
|
|
|
# Canonical device id -> the spellings written to the device column for it.
DEVICE_DISPLAY = dict(
    dexcom_g7=["Dexcom G7", "G7", "Dexcom G7 CGM", "dexcom g7"],
    dexcom_g6=["Dexcom G6", "G6", "Dexcom G6 Pro", "dexcom g6"],
    freestyle_libre_3=[
        "FreeStyle Libre 3",
        "Libre 3",
        "FSL3",
        "fs libre 3",
        "FreestyleLibre3",
    ],
    omnipod_5=["Omnipod 5", "Omnipod", "OmniPod 5", "op5"],
)
|
|
|
|
# One coverage flag per header variant, drawn up front with fixed weights
# (out_of_coverage 30, visit_due 25, refill_window 25, ok 20). The flag picks
# the date bucket for a file and appears in its filename.
# The sample size follows the variant table, so adding or removing a variant
# cannot leave the generation loop indexing past the end of this list.
flags_assigned = random.choices(
    ["out_of_coverage", "visit_due", "refill_window", "ok"],
    weights=[30, 25, 25, 20],
    k=len(HEADER_VARIANTS),
)
|
|
|
|
|
|
def random_date(bucket):
    """Pick a random date inside ``bucket``, both ends included.

    Args:
        bucket: ``(start, end)`` pair of dates.

    Returns:
        ``start`` plus a random whole number of days no larger than the span
        between the two dates. If ``end`` falls before ``start`` the span is
        treated as zero, so ``start`` itself is returned.
    """
    first_day, last_day = bucket
    span_days = (last_day - first_day).days
    if span_days < 0:
        # Reversed bucket: collapse to a single-day window.
        span_days = 0
    offset = random.randint(0, span_days)
    return first_day + timedelta(days=offset)
|
|
|
|
|
|
def format_date(d, fmt):
    """Render ``d`` as text using the strftime pattern ``fmt``.

    Args:
        d: Object exposing ``strftime`` (the script passes ``datetime.date``).
        fmt: strftime format string.

    Returns:
        Whatever ``d.strftime(fmt)`` produces.
    """
    rendered = d.strftime(fmt)
    return rendered
|
|
|
|
|
|
def random_payer_string(device):
    """Return a randomly chosen payer spelling.

    A payer category is drawn first (weights: medicare 50, medicaid 20,
    commercial 30), then one of that category's entries in PAYER_STRINGS is
    picked uniformly.

    Args:
        device: Accepted for the caller's convenience; the selection does not
            use it.

    Returns:
        One string from PAYER_STRINGS.
    """
    categories = ["medicare", "medicaid", "commercial"]
    (picked_category,) = random.choices(categories, weights=[50, 20, 30])
    candidates = PAYER_STRINGS[picked_category]
    return random.choice(candidates)
|
|
|
|
|
|
# Write one CSV per header variant into OUTPUT_DIR. Device, payer string and
# row count are drawn once per file; the ship date, device spelling and
# quantity are drawn per row.
for i, variant in enumerate(HEADER_VARIANTS, start=1):
    pid_col, dev_col, date_col, qty_col, payer_col, comp_col, date_fmt, extras = variant

    # The pre-drawn flag selects the date window and is echoed in the filename.
    flag_key = flags_assigned[i - 1]
    bucket = DATE_BUCKETS[flag_key]

    device = random.choice(DEVICE_TYPES)
    component = COMPONENTS[device]
    payer_str = random_payer_string(device)
    num_rows = random.randint(3, 8)

    filename = f"sample-batch-{i:02d}-{flag_key}.csv"
    filepath = os.path.join(OUTPUT_DIR, filename)

    # Core columns first, then the variant's extra columns in declared order.
    fieldnames = [pid_col, dev_col, date_col, qty_col, payer_col, comp_col, *extras]

    # Explicit encoding so the bytes written do not depend on the platform default.
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for j in range(num_rows):
            # Ten ids are reserved per file and at most eight are used, so ids
            # never collide across files.
            pid = f"PT-{1001 + (i - 1) * 10 + j}"
            # Add slight jitter around the bucket date.
            jittered = random_date(bucket) + timedelta(days=random.randint(-3, 3))
            # The "ok" bucket ends yesterday, so up to +3 days of jitter would
            # otherwise yield ship dates as much as two days in the future.
            ship = min(jittered, TODAY)
            row = {
                pid_col: pid,
                dev_col: random.choice(DEVICE_DISPLAY[device]),
                date_col: format_date(ship, date_fmt),
                qty_col: random.choice([1, 2, 3, 6, 9]),
                payer_col: payer_str,
                comp_col: component,
            }
            # Extra columns carry the same constant value on every row.
            row.update(extras)
            writer.writerow(row)

    print(f"Wrote {filename} ({num_rows} rows, flag={flag_key}, headers: {pid_col}|{dev_col}|{date_col}|{payer_col})")

# Report the number of files actually generated rather than a fixed count.
print(f"\nDone — {len(HEADER_VARIANTS)} files in {OUTPUT_DIR}")
|