Signal/test-data/generate_10k.py
Kisa cf171a3f87 add Phase 1 security hardening, mapping confidence, audit logging, pilot docs
- lock CORS to Vercel domain via ALLOWED_ORIGINS env var (removes allow_origins=*)
- add X-API-Key header auth on /api/upload and /api/export
- normalizer: add mapping confidence (high/inferred), new aliases for Acct #,
  Member ID, External Patient Ref, DME Description, dispensedate; 63/63 CSV files pass
- coverage_calculator: add RULE_VERSION = "v0.1", rule_version on every CoverageResult
- main.py: audit logging wired on upload + export, rule_version + mapping_summary in response
- generate_samples.py: 25 CSV files now use 25 different real-world header formats
- add generate_10k.py for 10,000-patient synthetic dataset
- add tests/smoke_test.py (passes against local backend)
- add docs/pilot-guide-v1.md for Robert Robinson pilot onboarding
- add docs/daniel-pilot-readiness-whitepaper.md and .pdf

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 05:41:25 -04:00

84 lines
2.7 KiB
Python

"""
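Generate a 10,000-row synthetic patient CSV for Signal volume testing.

Uses canonical headers and synthetic patient IDs (SYN-00001 through SYN-10000),
with a realistic distribution across flags, payers, and devices. The random
seed is fixed, but shipment dates are computed as offsets from today's date,
so the generated file changes from one day to the next.

Usage:
    python3 test-data/generate_10k.py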
"""
import csv
import random
from datetime import date, timedelta
from pathlib import Path
# Fixed seed: the sequence of random draws is the same on every run.
random.seed(99)

# Every shipment-date window below is expressed relative to the run date.
TODAY = date.today()

# The CSV is written next to this script.
OUTPUT = Path(__file__).parent.joinpath("10k-patients.csv")


def _days_ago(days):
    """Return the date that lies `days` days before TODAY."""
    return TODAY - timedelta(days=days)


# Each entry: (device_type, component, sampling weight).
DEVICE_OPTIONS = [
    ("dexcom_g7", "sensor", 0.40),
    ("freestyle_libre_3", "sensor", 0.25),
    ("freestyle_libre_2", "sensor", 0.20),
    ("dexcom_g6", "sensor", 0.10),
    ("omnipod_5", "pod", 0.05),
]

# Each entry: (payer name, sampling weight).
PAYER_OPTIONS = [
    ("Medicare Part B", 0.50),
    ("Medicaid - GA", 0.10),
    ("Medicaid - PA", 0.10),
    ("BCBS - FL", 0.08),
    ("Aetna", 0.07),
    ("UnitedHealth", 0.06),
    ("Cigna", 0.05),
    ("Humana", 0.04),
]

# Each entry: (flag, (earliest shipment date, latest shipment date), weight).
FLAG_DATE_RANGES = [
    ("out_of_coverage", (_days_ago(600), _days_ago(400)), 0.30),
    ("visit_due", (_days_ago(400), _days_ago(250)), 0.25),
    ("refill_window", (_days_ago(30), _days_ago(20)), 0.20),
    ("ok", (_days_ago(10), _days_ago(1)), 0.25),
]

# Parallel name/weight lists in the shape random.choices() expects, plus
# lookup tables keyed by the sampled name.
devices = [name for name, _component, _weight in DEVICE_OPTIONS]
dev_weights = [weight for _name, _component, weight in DEVICE_OPTIONS]
dev_comp = {name: component for name, component, _weight in DEVICE_OPTIONS}

payers = [name for name, _weight in PAYER_OPTIONS]
pay_weights = [weight for _name, weight in PAYER_OPTIONS]

flags = [flag for flag, _window, _weight in FLAG_DATE_RANGES]
flag_ranges = {flag: window for flag, window, _weight in FLAG_DATE_RANGES}
flag_weights = [weight for _flag, _window, weight in FLAG_DATE_RANGES]
def random_date_in(bucket):
    """Return a random date inside `bucket`, both endpoints included.

    Args:
        bucket: A ``(start, end)`` pair of ``datetime.date`` values.

    Returns:
        ``start`` plus a random whole number of days between 0 and
        ``(end - start).days`` inclusive. When ``end`` is earlier than
        ``start`` the span is clamped to zero, so ``start`` is returned.
    """
    start, end = bucket
    span_days = (end - start).days
    # Clamp a reversed bucket to an empty span rather than letting
    # random.randint() raise on an empty range.
    offset = random.randint(0, max(span_days, 0))
    return start + timedelta(days=offset)
# Generate the synthetic rows and stream them straight to OUTPUT.
rows_written = 0
# newline="" lets the csv module control line endings itself; the explicit
# encoding keeps the file independent of the platform's locale default.
with open(OUTPUT, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["patient_id", "device_type", "shipment_date", "quantity", "payer", "component"])
    for i in range(1, 10_001):
        pid = f"SYN-{i:05d}"
        # The order of the random draws below fixes the generated data for
        # the seeded generator; reordering them changes every row.
        device = random.choices(devices, weights=dev_weights)[0]
        comp = dev_comp[device]
        payer = random.choices(payers, weights=pay_weights)[0]
        # The flag itself is not written out; it only selects the window the
        # shipment date is drawn from.
        flag = random.choices(flags, weights=flag_weights)[0]
        ship = random_date_in(flag_ranges[flag])
        qty = random.choice([1, 2, 3, 6, 9, 14])
        writer.writerow([pid, device, ship.isoformat(), qty, payer, comp])
        rows_written += 1

print(f"Wrote {OUTPUT}")
print(f"Rows: {rows_written:,}")
print("Distribution targets: 30% Out of Coverage, 25% Visit Due, 20% Resupply Ready, 25% Active")