Upload generate_elaad_test_data.py with huggingface_hub
Browse files- generate_elaad_test_data.py +155 -51
generate_elaad_test_data.py
CHANGED
|
@@ -2,12 +2,22 @@
|
|
| 2 |
generate_elaad_test_data.py — ELAAD/APLD-style synthetic copay data generator.
|
| 3 |
Creates realistic Trelegy Ellipta claims with embedded fraud patterns.
|
| 4 |
|
| 5 |
-
FRAUD PATTERNS EMBEDDED
|
| 6 |
-
=======================
|
| 7 |
1. HCP level: Suspicious HCP with high one-and-done patient concentration
|
| 8 |
2. Pharmacy level: Pharmacy with high HCP concentration + high one-and-done patients
|
| 9 |
3. Patient level: Patients with very short gap bursts or one-and-done patterns
|
| 10 |
4. Transaction level: Early refills, wrong quantity, govt insurance, NDC switches
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
|
| 13 |
import os
|
|
@@ -23,9 +33,11 @@ N_HCPS = 40
|
|
| 23 |
N_PHARMACIES = 25
|
| 24 |
N_CLAIMS = 5000
|
| 25 |
|
| 26 |
-
# Trelegy NDCs
|
| 27 |
-
NDC_LIST = ["
|
| 28 |
NDC_PROB = [0.85, 0.15]
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Valid/suspicious specialties
|
| 31 |
VALID_SPECIALTIES = ["Pulmonology", "Allergy/Immunology", "Internal Medicine", "Family Medicine", "Respiratory"]
|
|
@@ -45,13 +57,38 @@ PLAN_PROBS = [0.20, 0.18, 0.15, 0.12, 0.10] + [0.05, 0.08, 0.04, 0.03, 0.05]
|
|
| 45 |
# Pharmacy subcategories
|
| 46 |
PHARM_SUBCATS = ["Chain", "Independent", "Mail Order", "Clinic", "Long-Term Care", "Supermarket"]
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
def generate_patient_profiles(n):
|
| 50 |
"""Generate patient-level profiles with fill count distributions."""
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
size=n, p=[0.10, 0.15, 0.20, 0.20, 0.15, 0.08, 0.05, 0.03, 0.02, 0.02])
|
| 54 |
-
# Embed fraud: ~5% patients are "one-and-done" (fraudulent singletons with suspicious patterns)
|
| 55 |
fraud_one_done = np.random.random(n) < 0.05
|
| 56 |
fill_counts = np.where(fraud_one_done, 1, fill_counts)
|
| 57 |
|
|
@@ -68,14 +105,12 @@ def generate_patient_profiles(n):
|
|
| 68 |
def generate_hcp_profiles(n):
|
| 69 |
"""Generate HCP profiles with fraud concentration."""
|
| 70 |
specialties = np.random.choice(ALL_SPECIALTIES, size=n, p=SPEC_PROBS)
|
| 71 |
-
# Embed fraud: ~10% of HCPs are suspicious (high one-and-done concentration)
|
| 72 |
fraud_hcp = np.random.random(n) < 0.10
|
| 73 |
hcps = pd.DataFrame({
|
| 74 |
"hcp_id": [f"EL_HCP{str(i+1).zfill(4)}" for i in range(n)],
|
| 75 |
"hcp_specialty": specialties,
|
| 76 |
"is_fraud_hcp": fraud_hcp,
|
| 77 |
})
|
| 78 |
-
# For fraud HCPs, override to suspicious specialty
|
| 79 |
hcps.loc[hcps["is_fraud_hcp"], "hcp_specialty"] = np.random.choice(SUSPICIOUS_SPECIALTIES,
|
| 80 |
size=hcps["is_fraud_hcp"].sum())
|
| 81 |
return hcps
|
|
@@ -85,7 +120,6 @@ def generate_pharmacy_profiles(n):
|
|
| 85 |
"""Generate pharmacy profiles with fraud concentration."""
|
| 86 |
subcats = np.random.choice(PHARM_SUBCATS, size=n, p=[0.35, 0.30, 0.10, 0.10, 0.10, 0.05])
|
| 87 |
states = np.random.choice(STATES, size=n)
|
| 88 |
-
# Embed fraud: ~8% of pharmacies are suspicious (high HCP concentration)
|
| 89 |
fraud_pharm = np.random.random(n) < 0.08
|
| 90 |
pharm = pd.DataFrame({
|
| 91 |
"pharmacy_id": [f"EL_PH{str(i+1).zfill(5)}" for i in range(n)],
|
|
@@ -98,12 +132,11 @@ def generate_pharmacy_profiles(n):
|
|
| 98 |
|
| 99 |
|
| 100 |
def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5000):
|
| 101 |
-
"""Generate transaction-level claims with embedded fraud patterns."""
|
| 102 |
records = []
|
| 103 |
claim_counter = 0
|
| 104 |
|
| 105 |
# Assign patients to HCPs (with concentration for fraud HCPs)
|
| 106 |
-
# Normal: patients see 1-2 HCPs. Fraud HCPs attract more patients with single fills.
|
| 107 |
for _, pat in patient_profiles.iterrows():
|
| 108 |
n_fills = pat["expected_fills"]
|
| 109 |
pat_id = pat["patient_id"]
|
|
@@ -112,18 +145,13 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 112 |
|
| 113 |
# Choose HCP(s)
|
| 114 |
if pat_fraud:
|
| 115 |
-
# Fraud patient: 80% chance assigned to a fraud HCP
|
| 116 |
fraud_hcps = hcp_profiles[hcp_profiles["is_fraud_hcp"]]
|
| 117 |
if len(fraud_hcps) > 0 and np.random.random() < 0.80:
|
| 118 |
hcp = fraud_hcps.sample(1).iloc[0]
|
| 119 |
else:
|
| 120 |
hcp = hcp_profiles.sample(1).iloc[0]
|
| 121 |
else:
|
| 122 |
-
|
| 123 |
-
if n_fills == 1:
|
| 124 |
-
hcp = hcp_profiles.sample(1).iloc[0]
|
| 125 |
-
else:
|
| 126 |
-
hcp = hcp_profiles.sample(1).iloc[0]
|
| 127 |
|
| 128 |
hcp_id = hcp["hcp_id"]
|
| 129 |
hcp_spec = hcp["hcp_specialty"]
|
|
@@ -131,19 +159,17 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 131 |
|
| 132 |
# Choose pharmacy
|
| 133 |
if pat_fraud or hcp_fraud:
|
| 134 |
-
# Fraud patterns: ~50% chance go to fraud pharmacy
|
| 135 |
fraud_pharm = pharmacy_profiles[pharmacy_profiles["is_fraud_pharmacy"]]
|
| 136 |
if len(fraud_pharm) > 0 and np.random.random() < 0.50:
|
| 137 |
pharm = fraud_pharm.sample(1).iloc[0]
|
| 138 |
else:
|
| 139 |
pharm = pharmacy_profiles.sample(1).iloc[0]
|
| 140 |
else:
|
| 141 |
-
# Normal: mostly retail/chain
|
| 142 |
normal_pharm = pharmacy_profiles[pharmacy_profiles["pharmacy_subcategory"].isin(["Chain", "Independent", "Supermarket"])]
|
| 143 |
if len(normal_pharm) > 0:
|
| 144 |
pharm = normal_pharm.sample(1).iloc[0]
|
| 145 |
else:
|
| 146 |
-
pharm = pharmacy_profiles.sample(1).iloc[
|
| 147 |
|
| 148 |
pharm_id = pharm["pharmacy_id"]
|
| 149 |
pharm_state = pharm["pharmacy_state"]
|
|
@@ -153,9 +179,10 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 153 |
# Cross-state indicator
|
| 154 |
cross_state = (pat_state != pharm_state)
|
| 155 |
|
|
|
|
|
|
|
|
|
|
| 156 |
# Generate fill dates
|
| 157 |
-
# Normal: 30-day intervals with some randomness
|
| 158 |
-
# Fraud: very short gaps or single fill
|
| 159 |
if n_fills == 1:
|
| 160 |
base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 330))
|
| 161 |
fill_dates = [base_date]
|
|
@@ -163,7 +190,6 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 163 |
base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 30))
|
| 164 |
fill_dates = [base_date]
|
| 165 |
for i in range(1, n_fills):
|
| 166 |
-
# Normal gap: 25-35 days. Fraud gap: 5-15 days (early refill)
|
| 167 |
if pat_fraud and np.random.random() < 0.6:
|
| 168 |
gap = np.random.randint(5, 16)
|
| 169 |
else:
|
|
@@ -176,7 +202,7 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 176 |
for idx, fill_date in enumerate(fill_dates):
|
| 177 |
claim_counter += 1
|
| 178 |
|
| 179 |
-
# Government insurance flag
|
| 180 |
if pat_fraud or hcp_fraud:
|
| 181 |
govt_prob = 0.25
|
| 182 |
else:
|
|
@@ -184,40 +210,82 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 184 |
is_govt = np.random.random() < govt_prob
|
| 185 |
plan = np.random.choice(GOVT_PLANS) if is_govt else np.random.choice(COMMERCIAL_PLANS)
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
# NDC
|
| 188 |
ndc = np.random.choice(NDC_LIST, p=NDC_PROB)
|
| 189 |
# Fraud: occasionally switch NDC for fraud patients
|
| 190 |
if pat_fraud and idx > 0 and np.random.random() < 0.3:
|
| 191 |
ndc = NDC_LIST[1] if ndc == NDC_LIST[0] else NDC_LIST[0]
|
| 192 |
|
| 193 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
qty = 1
|
| 195 |
if pat_fraud and np.random.random() < 0.2:
|
| 196 |
qty = 2
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
# Days supply
|
| 199 |
ds = 30
|
| 200 |
if pat_fraud and np.random.random() < 0.15:
|
| 201 |
ds = 90
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
-
# Financials
|
| 204 |
usual_customary = np.random.uniform(500, 800)
|
| 205 |
copay_before = usual_customary
|
| 206 |
-
benefit = np.random.uniform(450, 650)
|
| 207 |
copay_after = max(0, copay_before - benefit)
|
| 208 |
-
|
|
|
|
| 209 |
if hcp_fraud:
|
| 210 |
benefit += np.random.uniform(50, 200)
|
| 211 |
copay_after = max(0, copay_before - benefit)
|
| 212 |
|
| 213 |
-
#
|
| 214 |
-
reject_code = None
|
| 215 |
-
if np.random.random() < 0.04:
|
| 216 |
-
reject_code = np.random.choice(["76", "88", "79"])
|
| 217 |
-
if pat_fraud and np.random.random() < 0.15:
|
| 218 |
-
reject_code = np.random.choice(["76", "88", "79"])
|
| 219 |
-
|
| 220 |
-
# Paper submission (fraud: higher for fraud patients)
|
| 221 |
paper_sub = 0
|
| 222 |
if pat_fraud and np.random.random() < 0.3:
|
| 223 |
paper_sub = 1
|
|
@@ -228,10 +296,10 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 228 |
# DAW code
|
| 229 |
daw = np.random.choice(["0", "1", "2"], p=[0.85, 0.10, 0.05])
|
| 230 |
|
| 231 |
-
# Claim type
|
| 232 |
claim_type = np.random.choice(["N", "R", "A"], p=[0.94, 0.04, 0.02])
|
| 233 |
if pat_fraud and np.random.random() < 0.1:
|
| 234 |
-
claim_type = "A"
|
| 235 |
|
| 236 |
# Card ID
|
| 237 |
card_id = f"EL_C{np.random.randint(100000, 999999)}"
|
|
@@ -239,37 +307,66 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 239 |
# RX number
|
| 240 |
rx_num = f"EL_RX{np.random.randint(1000000, 9999999)}"
|
| 241 |
|
| 242 |
-
# Date written
|
| 243 |
date_written = fill_date - timedelta(days=np.random.randint(0, 6))
|
| 244 |
|
| 245 |
# BIN/PCN
|
| 246 |
bin_num = f"{np.random.randint(100000, 999999)}"
|
| 247 |
pcn = f"PCN{np.random.randint(10, 99)}"
|
| 248 |
|
| 249 |
-
#
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
-
# Is this claim fraudulent?
|
| 253 |
is_fraud = 0
|
| 254 |
fraud_type = ""
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
| 256 |
is_fraud = 1
|
| 257 |
-
fraud_type = "
|
| 258 |
-
|
|
|
|
| 259 |
is_fraud = 1
|
| 260 |
-
fraud_type = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
elif is_govt:
|
| 262 |
is_fraud = 1
|
| 263 |
fraud_type = "govt_insurance"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
elif reject_code in ["76", "88"]:
|
| 265 |
is_fraud = 1
|
| 266 |
fraud_type = "reject_fraud"
|
|
|
|
| 267 |
elif cross_state and np.random.random() < 0.3:
|
| 268 |
is_fraud = 1
|
| 269 |
fraud_type = "cross_state"
|
|
|
|
| 270 |
elif qty != 1 or ds != 30:
|
| 271 |
is_fraud = 1
|
| 272 |
fraud_type = "quantity_anomaly"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
record = {
|
| 275 |
# Patient
|
|
@@ -315,8 +412,11 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 315 |
"SUBMISSION_METHOD": "PAPER" if paper_sub else "ELECTRONIC",
|
| 316 |
"SUBMISSION_TYPE": "NEW",
|
| 317 |
"PAYMENT_METHOD": "ACH",
|
|
|
|
|
|
|
|
|
|
| 318 |
# Program
|
| 319 |
-
"
|
| 320 |
# Metadata
|
| 321 |
"FILE_NAME": "elaad_test_data_2024.txt",
|
| 322 |
# Ground truth
|
|
@@ -334,7 +434,7 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
|
|
| 334 |
def main():
|
| 335 |
os.makedirs("data", exist_ok=True)
|
| 336 |
|
| 337 |
-
print("Generating ELAAD-style synthetic test data...")
|
| 338 |
print(f" Patients: {N_PATIENTS}")
|
| 339 |
print(f" HCPs: {N_HCPS}")
|
| 340 |
print(f" Pharmacies: {N_PHARMACIES}")
|
|
@@ -348,9 +448,13 @@ def main():
|
|
| 348 |
# Summary
|
| 349 |
fraud_rate = df["is_fraud"].mean()
|
| 350 |
fraud_types = df["fraud_type"].value_counts().to_dict()
|
|
|
|
|
|
|
| 351 |
print(f"\nGenerated {len(df)} claims")
|
| 352 |
print(f" Fraud rate: {fraud_rate:.2%}")
|
| 353 |
print(f" Fraud types: {fraud_types}")
|
|
|
|
|
|
|
| 354 |
print(f" Patients: {df['PATIENT_ID'].nunique()}")
|
| 355 |
print(f" HCPs: {df['HCP_ID'].nunique()}")
|
| 356 |
print(f" Pharmacies: {df['PHARMACY_NABP_NUMBER'].nunique()}")
|
|
|
|
| 2 |
generate_elaad_test_data.py — ELAAD/APLD-style synthetic copay data generator.
|
| 3 |
Creates realistic Trelegy Ellipta claims with embedded fraud patterns.
|
| 4 |
|
| 5 |
+
FRAUD PATTERNS EMBEDDED (v4 Group-Aware)
|
| 6 |
+
========================================
|
| 7 |
1. HCP level: Suspicious HCP with high one-and-done patient concentration
|
| 8 |
2. Pharmacy level: Pharmacy with high HCP concentration + high one-and-done patients
|
| 9 |
3. Patient level: Patients with very short gap bursts or one-and-done patterns
|
| 10 |
4. Transaction level: Early refills, wrong quantity, govt insurance, NDC switches
|
| 11 |
+
5. GROUP-AWARE FRAUD (NEW):
|
| 12 |
+
- Group 8200 cash claims (not covered in 2025 design)
|
| 13 |
+
- Group 8200 commercial rejected claims (not covered in 2025 design)
|
| 14 |
+
- Group 8141 claims with benefit > cap for scenario+days supply
|
| 15 |
+
- Group 8141 using $500/30d cap outside Jan-Mar 2024
|
| 16 |
+
- Annual fill count > 12 per patient per year
|
| 17 |
+
- Annual days supply > 360 per patient per year
|
| 18 |
+
- Non-covered NDCs (not 00173088710 or 00173089310)
|
| 19 |
+
- Government insurance claims receiving GSK benefit
|
| 20 |
+
- Max benefit repeat (patient+pharmacy hitting cap >=3 times)
|
| 21 |
"""
|
| 22 |
|
| 23 |
import os
|
|
|
|
| 33 |
N_PHARMACIES = 25
|
| 34 |
N_CLAIMS = 5000
|
| 35 |
|
| 36 |
+
# Trelegy NDCs (canonical 11-digit)
|
| 37 |
+
NDC_LIST = ["00173089310", "00173088710"]
|
| 38 |
NDC_PROB = [0.85, 0.15]
|
| 39 |
+
# Non-covered NDC (for fraud embedding)
|
| 40 |
+
NON_COVERED_NDCS = ["00173089999", "00173088888"]
|
| 41 |
|
| 42 |
# Valid/suspicious specialties
|
| 43 |
VALID_SPECIALTIES = ["Pulmonology", "Allergy/Immunology", "Internal Medicine", "Family Medicine", "Respiratory"]
|
|
|
|
| 57 |
# Pharmacy subcategories
|
| 58 |
PHARM_SUBCATS = ["Chain", "Independent", "Mail Order", "Clinic", "Long-Term Care", "Supermarket"]
|
| 59 |
|
| 60 |
+
# Benefit caps for generating realistic benefit amounts.
# Each table maps a days-supply bucket (30 / 60 / 90) to the maximum benefit.
GROUP_8141_CASH_CAP = {30: 100, 60: 200, 90: 300}
GROUP_8141_COMM_APPROVED_CAP_PRE_2024 = {30: 200, 60: 400, 90: 600}
GROUP_8141_COMM_APPROVED_CAP_ELEVATED = {30: 500, 60: 1000, 90: 1500}  # Jan-Mar 2024
GROUP_8141_COMM_APPROVED_CAP_POST_MAR_2024 = {30: 200, 60: 400, 90: 600}
GROUP_8141_COMM_REJECTED_CAP = {30: 100, 60: 200, 90: 300}
GROUP_8200_COMM_APPROVED_CAP = {30: 645, 60: 1290, 90: 1935}


def get_cap_for_claim(group_id, scenario, fill_date, days_supply):
    """Get the benefit cap for a given group/scenario/date/ds combination.

    Args:
        group_id: Program group as a string ("8141" or "8200").
        scenario: One of "cash", "commercial_approved", "commercial_rejected".
        fill_date: ``datetime`` of the fill. Only consulted for Group 8141
            commercial-approved claims, whose cap depends on the period.
        days_supply: Days supply of the claim; bucketed to 30 (<= 30),
            60 (<= 60) or 90 (anything larger).

    Returns:
        The cap from the matching table, or 0 when the group/scenario pair
        has no table (for example Group 8200 cash or rejected claims, or an
        unknown group).
    """
    if days_supply <= 30:
        ds_key = 30
    elif days_supply <= 60:
        ds_key = 60
    else:
        ds_key = 90

    if group_id == "8141":
        if scenario == "cash":
            return GROUP_8141_CASH_CAP[ds_key]
        if scenario == "commercial_rejected":
            # Use the dedicated table rather than borrowing the cash one, so
            # the two can diverge without touching this function.
            return GROUP_8141_COMM_REJECTED_CAP[ds_key]
        if scenario == "commercial_approved":
            if fill_date < datetime(2024, 1, 1):
                return GROUP_8141_COMM_APPROVED_CAP_PRE_2024[ds_key]
            # Half-open upper bound: a fill time-stamped later in the day on
            # 31 March 2024 still belongs to the elevated Jan-Mar window.
            if fill_date < datetime(2024, 4, 1):
                return GROUP_8141_COMM_APPROVED_CAP_ELEVATED[ds_key]
            return GROUP_8141_COMM_APPROVED_CAP_POST_MAR_2024[ds_key]
    elif group_id == "8200" and scenario == "commercial_approved":
        return GROUP_8200_COMM_APPROVED_CAP[ds_key]
    return 0
|
| 86 |
+
|
| 87 |
|
| 88 |
def generate_patient_profiles(n):
|
| 89 |
"""Generate patient-level profiles with fill count distributions."""
|
| 90 |
+
fill_counts = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
|
| 91 |
+
size=n, p=[0.10, 0.15, 0.20, 0.20, 0.12, 0.08, 0.05, 0.03, 0.02, 0.02, 0.015, 0.015])
|
|
|
|
|
|
|
| 92 |
fraud_one_done = np.random.random(n) < 0.05
|
| 93 |
fill_counts = np.where(fraud_one_done, 1, fill_counts)
|
| 94 |
|
|
|
|
| 105 |
def generate_hcp_profiles(n):
    """Build the HCP roster, flagging roughly one in ten as fraudulent.

    Returns a DataFrame with the columns ``hcp_id``, ``hcp_specialty`` and
    ``is_fraud_hcp``. Flagged HCPs have their specialty redrawn from
    SUSPICIOUS_SPECIALTIES.
    """
    # Draw order (specialties, fraud mask, suspicious specialties) is kept so
    # a seeded run produces the same roster.
    drawn_specialties = np.random.choice(ALL_SPECIALTIES, size=n, p=SPEC_PROBS)
    fraud_flags = np.random.random(n) < 0.10
    hcp_ids = [f"EL_HCP{seq:04d}" for seq in range(1, n + 1)]

    hcps = pd.DataFrame(
        {
            "hcp_id": hcp_ids,
            "hcp_specialty": drawn_specialties,
            "is_fraud_hcp": fraud_flags,
        }
    )

    flagged = hcps["is_fraud_hcp"]
    suspicious = np.random.choice(SUSPICIOUS_SPECIALTIES, size=flagged.sum())
    hcps.loc[flagged, "hcp_specialty"] = suspicious
    return hcps
|
|
|
|
| 120 |
"""Generate pharmacy profiles with fraud concentration."""
|
| 121 |
subcats = np.random.choice(PHARM_SUBCATS, size=n, p=[0.35, 0.30, 0.10, 0.10, 0.10, 0.05])
|
| 122 |
states = np.random.choice(STATES, size=n)
|
|
|
|
| 123 |
fraud_pharm = np.random.random(n) < 0.08
|
| 124 |
pharm = pd.DataFrame({
|
| 125 |
"pharmacy_id": [f"EL_PH{str(i+1).zfill(5)}" for i in range(n)],
|
|
|
|
| 132 |
|
| 133 |
|
| 134 |
def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5000):
|
| 135 |
+
"""Generate transaction-level claims with embedded fraud patterns including group-aware fraud."""
|
| 136 |
records = []
|
| 137 |
claim_counter = 0
|
| 138 |
|
| 139 |
# Assign patients to HCPs (with concentration for fraud HCPs)
|
|
|
|
| 140 |
for _, pat in patient_profiles.iterrows():
|
| 141 |
n_fills = pat["expected_fills"]
|
| 142 |
pat_id = pat["patient_id"]
|
|
|
|
| 145 |
|
| 146 |
# Choose HCP(s)
|
| 147 |
if pat_fraud:
|
|
|
|
| 148 |
fraud_hcps = hcp_profiles[hcp_profiles["is_fraud_hcp"]]
|
| 149 |
if len(fraud_hcps) > 0 and np.random.random() < 0.80:
|
| 150 |
hcp = fraud_hcps.sample(1).iloc[0]
|
| 151 |
else:
|
| 152 |
hcp = hcp_profiles.sample(1).iloc[0]
|
| 153 |
else:
|
| 154 |
+
hcp = hcp_profiles.sample(1).iloc[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
hcp_id = hcp["hcp_id"]
|
| 157 |
hcp_spec = hcp["hcp_specialty"]
|
|
|
|
| 159 |
|
| 160 |
# Choose pharmacy
|
| 161 |
if pat_fraud or hcp_fraud:
    # Fraud-linked claims go to a flagged pharmacy about half the time
    # (when any flagged pharmacy exists); otherwise any pharmacy may be drawn.
    fraud_pharm = pharmacy_profiles[pharmacy_profiles["is_fraud_pharmacy"]]
    if len(fraud_pharm) > 0 and np.random.random() < 0.50:
        pharm = fraud_pharm.sample(1).iloc[0]
    else:
        pharm = pharmacy_profiles.sample(1).iloc[0]
else:
    # Non-fraud claims are restricted to Chain / Independent / Supermarket
    # pharmacies whenever at least one of those exists.
    normal_pharm = pharmacy_profiles[pharmacy_profiles["pharmacy_subcategory"].isin(["Chain", "Independent", "Supermarket"])]
    if len(normal_pharm) > 0:
        pharm = normal_pharm.sample(1).iloc[0]
    else:
        # sample(1) returns exactly one row, so position 0 is the only valid
        # index; iloc[1] raised IndexError whenever this fallback was reached.
        pharm = pharmacy_profiles.sample(1).iloc[0]
|
| 173 |
|
| 174 |
pharm_id = pharm["pharmacy_id"]
|
| 175 |
pharm_state = pharm["pharmacy_state"]
|
|
|
|
| 179 |
# Cross-state indicator
|
| 180 |
cross_state = (pat_state != pharm_state)
|
| 181 |
|
| 182 |
+
# Group assignment: ~60% Group 8141, ~40% Group 8200
|
| 183 |
+
group_id = np.random.choice(["8141", "8200"], p=[0.60, 0.40])
|
| 184 |
+
|
| 185 |
# Generate fill dates
|
|
|
|
|
|
|
| 186 |
if n_fills == 1:
|
| 187 |
base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 330))
|
| 188 |
fill_dates = [base_date]
|
|
|
|
| 190 |
base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 30))
|
| 191 |
fill_dates = [base_date]
|
| 192 |
for i in range(1, n_fills):
|
|
|
|
| 193 |
if pat_fraud and np.random.random() < 0.6:
|
| 194 |
gap = np.random.randint(5, 16)
|
| 195 |
else:
|
|
|
|
| 202 |
for idx, fill_date in enumerate(fill_dates):
|
| 203 |
claim_counter += 1
|
| 204 |
|
| 205 |
+
# Government insurance flag
|
| 206 |
if pat_fraud or hcp_fraud:
|
| 207 |
govt_prob = 0.25
|
| 208 |
else:
|
|
|
|
| 210 |
is_govt = np.random.random() < govt_prob
|
| 211 |
plan = np.random.choice(GOVT_PLANS) if is_govt else np.random.choice(COMMERCIAL_PLANS)
|
| 212 |
|
| 213 |
+
# Determine claim scenario
|
| 214 |
+
if is_govt:
|
| 215 |
+
# Government claims typically have some form of rejection or special handling
|
| 216 |
+
if np.random.random() < 0.3:
|
| 217 |
+
scenario = "commercial_rejected"
|
| 218 |
+
reject_code = np.random.choice(["76", "88", "79"])
|
| 219 |
+
else:
|
| 220 |
+
scenario = "commercial_approved"
|
| 221 |
+
reject_code = None
|
| 222 |
+
else:
|
| 223 |
+
# Normal commercial distribution
|
| 224 |
+
if np.random.random() < 0.08:
|
| 225 |
+
scenario = "cash"
|
| 226 |
+
reject_code = None
|
| 227 |
+
elif np.random.random() < 0.06:
|
| 228 |
+
scenario = "commercial_rejected"
|
| 229 |
+
reject_code = np.random.choice(["75", "77", "80"])
|
| 230 |
+
else:
|
| 231 |
+
scenario = "commercial_approved"
|
| 232 |
+
reject_code = None
|
| 233 |
+
|
| 234 |
# NDC
|
| 235 |
ndc = np.random.choice(NDC_LIST, p=NDC_PROB)
|
| 236 |
# Fraud: occasionally switch NDC for fraud patients
|
| 237 |
if pat_fraud and idx > 0 and np.random.random() < 0.3:
|
| 238 |
ndc = NDC_LIST[1] if ndc == NDC_LIST[0] else NDC_LIST[0]
|
| 239 |
|
| 240 |
+
# Non-covered NDC fraud: ~15% chance for fraud patients
|
| 241 |
+
if pat_fraud and np.random.random() < 0.15:
|
| 242 |
+
ndc = np.random.choice(NON_COVERED_NDCS)
|
| 243 |
+
|
| 244 |
+
# Quantity
|
| 245 |
qty = 1
|
| 246 |
if pat_fraud and np.random.random() < 0.2:
|
| 247 |
qty = 2
|
| 248 |
+
# Out of range fraud
|
| 249 |
+
if np.random.random() < 0.01:
|
| 250 |
+
qty = np.random.choice([0, 4, 5])
|
| 251 |
|
| 252 |
+
# Days supply
|
| 253 |
ds = 30
|
| 254 |
if pat_fraud and np.random.random() < 0.15:
|
| 255 |
ds = 90
|
| 256 |
+
# Out of range fraud
|
| 257 |
+
if np.random.random() < 0.01:
|
| 258 |
+
ds = np.random.choice([0, 95, 100])
|
| 259 |
+
|
| 260 |
+
# Calculate proper cap
|
| 261 |
+
cap = get_cap_for_claim(group_id, scenario, fill_date, ds)
|
| 262 |
+
|
| 263 |
+
# Financials — base on cap with some variation
|
| 264 |
+
if cap > 0:
|
| 265 |
+
# Normal: benefit is 70-100% of cap
|
| 266 |
+
benefit = cap * np.random.uniform(0.70, 1.0)
|
| 267 |
+
# Fraud: sometimes exceed cap by 10-50%
|
| 268 |
+
if pat_fraud and np.random.random() < 0.3:
|
| 269 |
+
benefit = cap * np.random.uniform(1.10, 1.50)
|
| 270 |
+
# Group 8141 Jan-Mar elevated benefit fraud: use $500 cap outside that period
|
| 271 |
+
if group_id == "8141" and scenario == "commercial_approved" and not (datetime(2024, 1, 1) <= fill_date <= datetime(2024, 3, 31)):
|
| 272 |
+
if np.random.random() < 0.1: # 10% chance of wrong-period fraud
|
| 273 |
+
elevated_cap = GROUP_8141_COMM_APPROVED_CAP_ELEVATED[30 if ds <= 30 else (60 if ds <= 60 else 90)]
|
| 274 |
+
benefit = elevated_cap * np.random.uniform(0.8, 1.0)
|
| 275 |
+
else:
|
| 276 |
+
# Non-covered scenario (e.g., cash under Group 8200)
|
| 277 |
+
benefit = np.random.uniform(50, 300) # Still paid (fraud!)
|
| 278 |
|
|
|
|
| 279 |
usual_customary = np.random.uniform(500, 800)
|
| 280 |
copay_before = usual_customary
|
|
|
|
| 281 |
copay_after = max(0, copay_before - benefit)
|
| 282 |
+
|
| 283 |
+
# HCP fraud: higher benefit
|
| 284 |
if hcp_fraud:
|
| 285 |
benefit += np.random.uniform(50, 200)
|
| 286 |
copay_after = max(0, copay_before - benefit)
|
| 287 |
|
| 288 |
+
# Paper submission
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
paper_sub = 0
|
| 290 |
if pat_fraud and np.random.random() < 0.3:
|
| 291 |
paper_sub = 1
|
|
|
|
| 296 |
# DAW code
|
| 297 |
daw = np.random.choice(["0", "1", "2"], p=[0.85, 0.10, 0.05])
|
| 298 |
|
| 299 |
+
# Claim type
|
| 300 |
claim_type = np.random.choice(["N", "R", "A"], p=[0.94, 0.04, 0.02])
|
| 301 |
if pat_fraud and np.random.random() < 0.1:
|
| 302 |
+
claim_type = "A"
|
| 303 |
|
| 304 |
# Card ID
|
| 305 |
card_id = f"EL_C{np.random.randint(100000, 999999)}"
|
|
|
|
| 307 |
# RX number
|
| 308 |
rx_num = f"EL_RX{np.random.randint(1000000, 9999999)}"
|
| 309 |
|
| 310 |
+
# Date written
|
| 311 |
date_written = fill_date - timedelta(days=np.random.randint(0, 6))
|
| 312 |
|
| 313 |
# BIN/PCN
|
| 314 |
bin_num = f"{np.random.randint(100000, 999999)}"
|
| 315 |
pcn = f"PCN{np.random.randint(10, 99)}"
|
| 316 |
|
| 317 |
+
# Payer status derived from scenario
|
| 318 |
+
payer_status_map = {
|
| 319 |
+
"cash": "CASH",
|
| 320 |
+
"commercial_approved": "APPROVED",
|
| 321 |
+
"commercial_rejected": "REJECTED",
|
| 322 |
+
}
|
| 323 |
+
payer_status = payer_status_map.get(scenario, "APPROVED")
|
| 324 |
|
| 325 |
+
# Is this claim fraudulent? (ground truth)
# The chain below assigns at most one label per claim: the first matching
# condition wins, so the order of the branches is the label priority.
is_fraud = 0
fraud_type = ""

# Check group-aware fraud conditions
# Group 8200 non-covered scenario
if group_id == "8200" and scenario in ["cash", "commercial_rejected"]:
    is_fraud = 1
    fraud_type = "group_8200_non_covered_scenario"
# Benefit exceeds cap
elif cap > 0 and benefit > cap:
    is_fraud = 1
    fraud_type = "benefit_cap_exceeded"
# Group 8141 wrong-period elevated benefit
# NOTE(review): this branch looks unreachable. For a Group 8141
# commercial_approved claim the get_cap_for_claim(...) call here takes the
# same arguments that produced `cap`, so `benefit > cap` has already matched
# the branch above and the claim is labelled "benefit_cap_exceeded".
# Presumably a flag set where the wrong-period benefit is generated is needed
# to tell the two apart -- confirm against the full file.
elif group_id == "8141" and scenario == "commercial_approved" and benefit > get_cap_for_claim("8141", "commercial_approved", fill_date, ds) and not (datetime(2024, 1, 1) <= fill_date <= datetime(2024, 3, 31)):
    is_fraud = 1
    fraud_type = "group_8141_wrong_period_benefit"
# Government insurance
elif is_govt:
    is_fraud = 1
    fraud_type = "govt_insurance"
# Non-covered NDC
elif ndc in NON_COVERED_NDCS:
    is_fraud = 1
    fraud_type = "non_covered_ndc"
# Reject fraud
elif reject_code in ["76", "88"]:
    is_fraud = 1
    fraud_type = "reject_fraud"
# Cross-state (only a random 30% of cross-state claims are labelled)
elif cross_state and np.random.random() < 0.3:
    is_fraud = 1
    fraud_type = "cross_state"
# Quantity/days supply anomaly (anything other than qty 1 and 30 days)
elif qty != 1 or ds != 30:
    is_fraud = 1
    fraud_type = "quantity_anomaly"
# HCP fraud
elif hcp_fraud and (hcp_spec in SUSPICIOUS_SPECIALTIES):
    is_fraud = 1
    fraud_type = "hcp_fraud"
# Patient fraud (catch-all for fraud patients not matched above)
elif pat_fraud:
    is_fraud = 1
    fraud_type = "patient_fraud"
|
| 370 |
|
| 371 |
record = {
|
| 372 |
# Patient
|
|
|
|
| 412 |
"SUBMISSION_METHOD": "PAPER" if paper_sub else "ELECTRONIC",
|
| 413 |
"SUBMISSION_TYPE": "NEW",
|
| 414 |
"PAYMENT_METHOD": "ACH",
|
| 415 |
+
# Payer status
|
| 416 |
+
"PAYER_STATUS": payer_status,
|
| 417 |
+
"CLAIM_SCENARIO": scenario,
|
| 418 |
# Program
|
| 419 |
+
"GROUP_ID": group_id,
|
| 420 |
# Metadata
|
| 421 |
"FILE_NAME": "elaad_test_data_2024.txt",
|
| 422 |
# Ground truth
|
|
|
|
| 434 |
def main():
|
| 435 |
os.makedirs("data", exist_ok=True)
|
| 436 |
|
| 437 |
+
print("Generating ELAAD-style synthetic test data (v4 Group-Aware)...")
|
| 438 |
print(f" Patients: {N_PATIENTS}")
|
| 439 |
print(f" HCPs: {N_HCPS}")
|
| 440 |
print(f" Pharmacies: {N_PHARMACIES}")
|
|
|
|
| 448 |
# Summary
|
| 449 |
fraud_rate = df["is_fraud"].mean()
|
| 450 |
fraud_types = df["fraud_type"].value_counts().to_dict()
|
| 451 |
+
group_counts = df["GROUP_ID"].value_counts().to_dict()
|
| 452 |
+
scenario_counts = df["CLAIM_SCENARIO"].value_counts().to_dict()
|
| 453 |
print(f"\nGenerated {len(df)} claims")
|
| 454 |
print(f" Fraud rate: {fraud_rate:.2%}")
|
| 455 |
print(f" Fraud types: {fraud_types}")
|
| 456 |
+
print(f" Groups: {group_counts}")
|
| 457 |
+
print(f" Scenarios: {scenario_counts}")
|
| 458 |
print(f" Patients: {df['PATIENT_ID'].nunique()}")
|
| 459 |
print(f" HCPs: {df['HCP_ID'].nunique()}")
|
| 460 |
print(f" Pharmacies: {df['PHARMACY_NABP_NUMBER'].nunique()}")
|