Signal/test-data/generate_samples.py
Kisa cf171a3f87 add Phase 1 security hardening, mapping confidence, audit logging, pilot docs
- lock CORS to Vercel domain via ALLOWED_ORIGINS env var (removes allow_origins=*)
- add X-API-Key header auth on /api/upload and /api/export
- normalizer: add mapping confidence (high/inferred), new aliases for Acct #,
  Member ID, External Patient Ref, DME Description, dispensedate; 63/63 CSV files pass
- coverage_calculator: add RULE_VERSION = "v0.1", rule_version on every CoverageResult
- main.py: audit logging wired on upload + export, rule_version + mapping_summary in response
- generate_samples.py: 25 CSV files now use 25 different real-world header formats
- add generate_10k.py for 10,000-patient synthetic dataset
- add tests/smoke_test.py (passes against local backend)
- add docs/pilot-guide-v1.md for Robert Robinson pilot onboarding
- add docs/daniel-pilot-readiness-whitepaper.md and .pdf

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 05:41:25 -04:00

195 lines
8.1 KiB
Python

"""
Generate 25 CSV test files with VARIED headers simulating messy supplier exports.
Each file uses a different combination of column names, date formats, column order,
and payer strings — matching what real DME billing system exports look like.
The normalizer (normalizer.py) should successfully process all 25 files.
"""
import csv
import random
import os
from datetime import date, timedelta
# Fixed seed so the sequence of random draws below is repeatable.
random.seed(42)

# Canonical device identifiers, and the component name written for each one.
DEVICE_TYPES = [
    "dexcom_g7",
    "dexcom_g6",
    "freestyle_libre_3",
    "omnipod_5",
]
COMPONENTS = {
    "dexcom_g7": "sensor",
    "dexcom_g6": "sensor",
    "freestyle_libre_3": "sensor",
    "omnipod_5": "pod",
}
COMPONENT_DISPLAY = {
    "sensor": "Sensor",
    "pod": "Pod",
}

# All date windows are measured back from the day the script is run.
TODAY = date.today()

# Named shipment-date windows as (earliest, latest) date pairs. The numbers
# in the table are "days before TODAY" for the earliest and latest bound.
DATE_BUCKETS = {
    label: (TODAY - timedelta(days=oldest), TODAY - timedelta(days=newest))
    for label, oldest, newest in (
        ("ok", 10, 1),
        ("visit_due", 400, 250),
        ("out_of_coverage", 600, 500),
        ("refill_window", 30, 25),
    )
}

# Generated CSV files are written into the directory containing this script.
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
# --- Header variation configs ---
# One 8-tuple per generated file:
#   (patient_id_col, device_col, date_col, qty_col, payer_col, component_col,
#    date_fmt, extras)
# The first six items are the header names used for that file, date_fmt is the
# strftime pattern applied to the ship date, and extras maps additional
# constant-valued columns (header -> value) appended after the core six.
# Payer *values* are not part of a variant: the generation loop draws one at
# random for each file.
HEADER_VARIANTS = [
    # 1 — canonical snake_case headers, ISO date
    (
        "patient_id", "device_type", "shipment_date", "quantity", "payer", "component",
        "%Y-%m-%d", {},
    ),
    # 2 — Brightree-style headers, US date, two extra columns
    (
        "Patient ID", "Item Description", "Service Date", "Qty", "Insurance Name", "Item Type",
        "%m/%d/%Y", {"Prescriber NPI": "1234567890", "Branch": "PA-001"},
    ),
    # 3 — all caps, short names
    (
        "PT_ID", "DEVICE", "SHIP DATE", "UNITS", "CARRIER", "TYPE",
        "%Y-%m-%d", {},
    ),
    # 4 — MRN identifier, day-abbreviated month-year text date
    (
        "MRN", "Product Name", "Dispense Date", "Qty Dispensed", "Plan Name", "Supply Type",
        "%d-%b-%Y", {"Supplier": "Gaboro DME", "State": "PA"},
    ),
    # 5 — Account Number identifier, US date with two-digit year
    (
        "Account Number", "Product", "Fill Date", "Count", "Primary Payer", "component",
        "%m/%d/%y", {},
    ),
    # 6 — External Patient Ref identifier, ISO datetime
    (
        "External Patient Ref", "Item", "Date of Service", "Qty Shipped", "Insurance", "item_type",
        "%Y-%m-%dT%H:%M:%S", {"Notes": "batch export"},
    ),
    # 7 — "Acct #" abbreviation, compact YYYYMMDD date
    (
        "Acct #", "DME", "Order Date", "quantity", "plan", "component_type",
        "%Y%m%d", {},
    ),
    # 8 — lower-case headers with spaces
    (
        "patient", "device type", "last ship date", "units dispensed", "payer name", "type",
        "%m/%d/%Y", {"Supplier Branch": "NY-003"},
    ),
    # 9 — terse snake_case abbreviations (pt_id, dos, ins_name)
    (
        "pt_id", "product_type", "dos", "qty", "ins_name", "component",
        "%Y-%m-%d", {},
    ),
    # 10 — account_no identifier, HCPCS description column, dash-separated US date
    (
        "account_no", "hcpcs_description", "service_date", "units", "primary_payer", "supply_type",
        "%m-%d-%Y", {"HCPCS Code": "A9277"},
    ),
    # 11 — patient_account identifier, generic "description" device column
    (
        "patient_account", "description", "ship_date", "quantity_dispensed", "carrier", "component",
        "%Y-%m-%d", {"Account Manager": "J. Smith"},
    ),
    # 12 — bare "id" identifier
    (
        "id", "product", "fill_date", "qty_shipped", "payer", "item_type",
        "%m/%d/%Y", {},
    ),
    # 13 — "PT ID" with a space, title-case headers
    (
        "PT ID", "Device Type", "Shipment Date", "Quantity", "Insurance", "Component",
        "%Y-%m-%d", {"Region": "Southeast"},
    ),
    # 14 — run-together lower-case headers (no separators)
    (
        "patientid", "devicetype", "dispensedate", "qty", "payername", "supplytype",
        "%m/%d/%Y", {},
    ),
    # 15 — account_number identifier, day-first date
    (
        "account_number", "item_description", "order_date", "units_dispensed", "plan_name", "component",
        "%d/%m/%Y", {"Facility": "Gaboro PA Main"},
    ),
    # 16 — MRN identifier, "Component Type" column
    (
        "MRN", "Product Type", "Service Date", "Qty", "Insurance Name", "Component Type",
        "%Y-%m-%d", {},
    ),
    # 17 — mixed case with underscores, US date plus time
    (
        "Patient_ID", "Device", "Ship_Date", "Units", "Plan", "Type",
        "%m/%d/%Y %H:%M:%S", {"Export Type": "CGM Only"},
    ),
    # 18 — "patient id" with a space, two extra columns
    (
        "patient id", "item", "dispense date", "count", "carrier", "supply_type",
        "%Y-%m-%d", {"Billing Staff": "M. Jones", "Auth Number": "CGM-2026-001"},
    ),
    # 19 — "Acct No" identifier, abbreviated month-name date
    (
        "Acct No", "Product Name", "Last Ship Date", "Qty Dispensed", "Primary Payer", "Component",
        "%b %d, %Y", {},
    ),
    # 20 — "Member ID" identifier
    (
        "Member ID", "DME Description", "DOS", "QTY", "Insurance", "Item Type",
        "%Y-%m-%d", {"Payer ID": "00019"},
    ),
    # 21 — "pt id" with a space, otherwise near-canonical headers
    (
        "pt id", "device_type", "service date", "quantity", "payer", "component",
        "%m/%d/%Y", {},
    ),
    # 22 — acct_no identifier, lower-case headers with spaces
    (
        "acct_no", "product", "fill date", "units", "insurance name", "type",
        "%Y-%m-%d", {"Branch Code": "GA-02"},
    ),
    # 23 — External Patient Ref identifier, US date plus time
    (
        "External Patient Ref", "Item Description", "Dispense Date", "Quantity", "Plan Name", "Supply Type",
        "%m/%d/%Y %H:%M:%S", {},
    ),
    # 24 — canonical headers plus three extra noise columns
    (
        "patient_id", "device_type", "shipment_date", "quantity", "payer", "component",
        "%Y-%m-%d", {"Internal Code": "DME-99", "Region": "Northeast", "Staff ID": "STAFF-001"},
    ),
    # 25 — "Acct #" identifier, full month-name date (%B %d, %Y)
    (
        "Acct #", "Device", "Order Date", "Qty", "Insurance", "Component",
        "%B %d, %Y", {"Supplier Code": "STTIL-01"},
    ),
]
# Payer display strings, grouped by payer category. One string is picked at
# random from the chosen category for each generated file.
PAYER_STRINGS = {
    "medicare": [
        "Medicare Part B",
        "Medicare",
        "CMS",
        "Medicare Part A",
        "Medicare Part B - CGM",
    ],
    "medicaid": [
        "Medicaid - GA",
        "Medicaid - PA",
        "Molina Healthcare",
        "WellCare",
        "Centene",
        "Georgia Medicaid",
        "Medicaid",
    ],
    "commercial": [
        "BCBS - FL",
        "Blue Cross Blue Shield",
        "Aetna",
        "UnitedHealth",
        "UHC",
        "Cigna",
        "Humana",
        "Anthem",
        "United Healthcare",
        "Aetna Commercial",
    ],
}

# Spelling variants written into the device column, keyed by canonical device
# identifier. A variant is picked at random for every row.
DEVICE_DISPLAY = {
    "dexcom_g7": [
        "Dexcom G7",
        "G7",
        "Dexcom G7 CGM",
        "dexcom g7",
    ],
    "dexcom_g6": [
        "Dexcom G6",
        "G6",
        "Dexcom G6 Pro",
        "dexcom g6",
    ],
    "freestyle_libre_3": [
        "FreeStyle Libre 3",
        "Libre 3",
        "FSL3",
        "fs libre 3",
        "FreestyleLibre3",
    ],
    "omnipod_5": [
        "Omnipod 5",
        "Omnipod",
        "OmniPod 5",
        "op5",
    ],
}
# One date-bucket key per header variant, drawn up front in a single weighted
# call. The draw is sized from HEADER_VARIANTS rather than a literal 25 so that
# the per-file lookup `flags_assigned[i - 1]` in the generation loop stays in
# range when variants are added or removed; for the current 25 variants the
# result is identical to the hard-coded version.
flags_assigned = random.choices(
    ["out_of_coverage", "visit_due", "refill_window", "ok"],
    weights=[30, 25, 25, 20],
    k=len(HEADER_VARIANTS),
)
def random_date(bucket):
    """Pick a random date from an inclusive (start, end) date pair.

    Args:
        bucket: Two-item sequence ``(start, end)`` of ``date`` objects.

    Returns:
        ``start`` plus a whole number of days drawn uniformly from
        ``0..(end - start).days``. If ``end`` is earlier than ``start`` the
        span is treated as zero and ``start`` is returned.
    """
    earliest, latest = bucket
    span_days = (latest - earliest).days
    if span_days < 0:
        # Reversed window: collapse to a single day instead of letting
        # randint() raise on an empty range.
        span_days = 0
    offset = random.randint(0, span_days)
    return earliest + timedelta(days=offset)
def format_date(d, fmt):
    """Return ``d`` rendered as text using the strftime pattern ``fmt``.

    Args:
        d: Object providing ``strftime`` (the script passes ``date`` values).
        fmt: strftime format string, e.g. ``"%Y-%m-%d"``.
    """
    rendered = d.strftime(fmt)
    return rendered
def random_payer_string(device):
    """Return a randomly chosen payer display string.

    A payer category is drawn first (weights: medicare 50, medicaid 20,
    commercial 30), then one string is picked uniformly from that category's
    list in ``PAYER_STRINGS``.

    Args:
        device: Accepted for the caller's convenience; the current
            implementation does not use it when choosing a payer.
    """
    categories = ["medicare", "medicaid", "commercial"]
    (picked_category,) = random.choices(categories, weights=[50, 20, 30])
    candidates = PAYER_STRINGS[picked_category]
    return random.choice(candidates)
# Write one CSV file per header variant. Values that are constant within a
# file (date bucket, device, payer string, row count) are drawn once per file;
# the device spelling, ship date and quantity are drawn per row.
for i, variant in enumerate(HEADER_VARIANTS, start=1):
    pid_col, dev_col, date_col, qty_col, payer_col, comp_col, date_fmt, extras = variant

    flag_key = flags_assigned[i - 1]
    bucket = DATE_BUCKETS[flag_key]
    device = random.choice(DEVICE_TYPES)
    component = COMPONENTS[device]
    payer_str = random_payer_string(device)
    num_rows = random.randint(3, 8)

    filename = f"sample-batch-{i:02d}-{flag_key}.csv"
    filepath = os.path.join(OUTPUT_DIR, filename)
    # Core six columns first, then any extra columns in their declared order.
    fieldnames = [pid_col, dev_col, date_col, qty_col, payer_col, comp_col, *extras]

    # newline="" lets the csv module control line endings; the explicit
    # encoding keeps the output identical regardless of the platform default.
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for j in range(num_rows):
            # Ten IDs are reserved per file, so IDs never collide across files
            # (num_rows is at most 8).
            pid = f"PT-{1001 + (i - 1) * 10 + j}"
            ship = random_date(bucket)
            # add slight jitter
            ship = ship + timedelta(days=random.randint(-3, 3))
            # The "ok" bucket ends one day before TODAY, so +3 days of jitter
            # could otherwise yield a shipment dated in the future.
            ship = min(ship, TODAY)
            row = {
                pid_col: pid,
                dev_col: random.choice(DEVICE_DISPLAY[device]),
                date_col: format_date(ship, date_fmt),
                qty_col: random.choice([1, 2, 3, 6, 9]),
                payer_col: payer_str,
                comp_col: component,
            }
            # Extra columns carry the same constant value on every row.
            row.update(extras)
            writer.writerow(row)

    print(f"Wrote {filename} ({num_rows} rows, flag={flag_key}, headers: {pid_col}|{dev_col}|{date_col}|{payer_col})")

# Report the actual number of files written rather than a hard-coded count.
print(f"\nDone — {len(HEADER_VARIANTS)} files in {OUTPUT_DIR}")