- lock CORS to Vercel domain via ALLOWED_ORIGINS env var (removes allow_origins=*) - add X-API-Key header auth on /api/upload and /api/export - normalizer: add mapping confidence (high/inferred), new aliases for Acct #, Member ID, External Patient Ref, DME Description, dispensedate; 63/63 CSV files pass - coverage_calculator: add RULE_VERSION = "v0.1", rule_version on every CoverageResult - main.py: audit logging wired on upload + export, rule_version + mapping_summary in response - generate_samples.py: 25 CSV files now use 25 different real-world header formats - add generate_10k.py for 10,000-patient synthetic dataset - add tests/smoke_test.py (passes against local backend) - add docs/pilot-guide-v1.md for Robert Robinson pilot onboarding - add docs/daniel-pilot-readiness-whitepaper.md and .pdf Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
84 lines
2.7 KiB
Python
84 lines
2.7 KiB
Python
"""
|
|
Generate a 10,000-row synthetic patient CSV for Signal volume testing.

Uses canonical headers and synthetic patient IDs (SYN-00001 through SYN-10000).
Rows follow a realistic distribution across flags, payers, and devices.

Usage:
    python3 test-data/generate_10k.py
|
|
"""
|
|
|
|
import csv
|
|
import random
|
|
from datetime import date, timedelta
|
|
from pathlib import Path
|
|
|
|
# Fixed seed so the sequence of random draws is repeatable between runs.
random.seed(99)

# Reference day: the shipment-date buckets are expressed as offsets from it.
TODAY = date.today()

# The CSV is written into the same directory as this script.
OUTPUT = Path(__file__).parent.joinpath("10k-patients.csv")
|
|
|
|
# Device catalogue. Each entry is (device_type, component, selection weight);
# the weights are relative frequencies that add up to 1.0.
DEVICE_OPTIONS = [
    ("dexcom_g7", "sensor", 0.40),
    ("freestyle_libre_3", "sensor", 0.25),
    ("freestyle_libre_2", "sensor", 0.20),
    ("dexcom_g6", "sensor", 0.10),
    ("omnipod_5", "pod", 0.05),
]
|
|
|
|
# Payer catalogue. Each entry is (payer name, selection weight); the weights
# are relative frequencies that add up to 1.0.
PAYER_OPTIONS = [
    ("Medicare Part B", 0.50),
    ("Medicaid - GA", 0.10),
    ("Medicaid - PA", 0.10),
    ("BCBS - FL", 0.08),
    ("Aetna", 0.07),
    ("UnitedHealth", 0.06),
    ("Cigna", 0.05),
    ("Humana", 0.04),
]
|
|
|
|
# Shipment-date buckets. Each entry is
# (flag name, (earliest date, latest date), selection weight), with both
# dates given as a number of days before TODAY.
FLAG_DATE_RANGES = [
    (
        "out_of_coverage",
        (TODAY - timedelta(days=600), TODAY - timedelta(days=400)),
        0.30,
    ),
    (
        "visit_due",
        (TODAY - timedelta(days=400), TODAY - timedelta(days=250)),
        0.25,
    ),
    (
        "refill_window",
        (TODAY - timedelta(days=30), TODAY - timedelta(days=20)),
        0.20,
    ),
    (
        "ok",
        (TODAY - timedelta(days=10), TODAY - timedelta(days=1)),
        0.25,
    ),
]
|
|
|
|
# Split each option table into the parallel name/weight sequences that
# random.choices() takes, plus dictionaries for lookups by name.

# Devices: names, weights, and device -> component.
devices = [name for name, _component, _weight in DEVICE_OPTIONS]
dev_weights = [weight for _name, _component, weight in DEVICE_OPTIONS]
dev_comp = {name: component for name, component, _weight in DEVICE_OPTIONS}

# Payers: names and weights.
payers = [name for name, _weight in PAYER_OPTIONS]
pay_weights = [weight for _name, weight in PAYER_OPTIONS]

# Flags: names, flag -> (start, end) date range, and weights.
flags = [name for name, _span, _weight in FLAG_DATE_RANGES]
flag_ranges = {name: span for name, span, _weight in FLAG_DATE_RANGES}
flag_weights = [weight for _name, _span, weight in FLAG_DATE_RANGES]
|
|
|
|
|
|
def random_date_in(bucket):
    """Pick a random date from an inclusive ``(start, end)`` date range.

    Args:
        bucket: Two-item sequence ``(start, end)`` of ``date`` objects.

    Returns:
        ``start`` plus a whole number of days drawn with ``random.randint``
        from 0 to ``(end - start).days`` inclusive. When ``end`` is earlier
        than ``start`` the span is clamped to zero, so ``start`` is returned.
    """
    first_day, last_day = bucket
    span_days = (last_day - first_day).days
    if span_days < 0:
        # Inverted range: fall back to a zero-length span.
        span_days = 0
    offset = random.randint(0, span_days)
    return first_day + timedelta(days=offset)
|
|
|
|
|
|
def _synthetic_row(index):
    """Build the CSV row for the synthetic patient numbered *index*.

    The random draws happen in a fixed order (device, payer, flag, shipment
    date, quantity) so the seeded random stream is consumed the same way for
    every row.
    """
    device_name = random.choices(devices, weights=dev_weights)[0]
    payer_name = random.choices(payers, weights=pay_weights)[0]
    flag_name = random.choices(flags, weights=flag_weights)[0]
    shipped_on = random_date_in(flag_ranges[flag_name])
    quantity = random.choice([1, 2, 3, 6, 9, 14])
    return [
        f"SYN-{index:05d}",
        device_name,
        shipped_on.isoformat(),
        quantity,
        payer_name,
        dev_comp[device_name],  # component is determined by the device
    ]


# Write the header followed by 10,000 patient rows, counting rows as we go.
rows_written = 0
with open(OUTPUT, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(
        ["patient_id", "device_type", "shipment_date", "quantity", "payer", "component"]
    )
    for i in range(1, 10_001):
        writer.writerow(_synthetic_row(i))
        rows_written += 1

# Summary for the person running the script.
print(f"Wrote {OUTPUT}")
print(f"Rows: {rows_written:,}")
print("Distribution targets: 30% Out of Coverage, 25% Visit Due, 20% Resupply Ready, 25% Active")
|