Harsh2396 committed on
Commit
ca13beb
·
verified ·
1 Parent(s): c093a23

Upload generate_elaad_test_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. generate_elaad_test_data.py +155 -51
generate_elaad_test_data.py CHANGED
@@ -2,12 +2,22 @@
2
  generate_elaad_test_data.py — ELAAD/APLD-style synthetic copay data generator.
3
  Creates realistic Trelegy Ellipta claims with embedded fraud patterns.
4
 
5
- FRAUD PATTERNS EMBEDDED
6
- =======================
7
  1. HCP level: Suspicious HCP with high one-and-done patient concentration
8
  2. Pharmacy level: Pharmacy with high HCP concentration + high one-and-done patients
9
  3. Patient level: Patients with very short gap bursts or one-and-done patterns
10
  4. Transaction level: Early refills, wrong quantity, govt insurance, NDC switches
 
 
 
 
 
 
 
 
 
 
11
  """
12
 
13
  import os
@@ -23,9 +33,11 @@ N_HCPS = 40
23
  N_PHARMACIES = 25
24
  N_CLAIMS = 5000
25
 
26
- # Trelegy NDCs
27
- NDC_LIST = ["00173089314", "00173088714"]
28
  NDC_PROB = [0.85, 0.15]
 
 
29
 
30
  # Valid/suspicious specialties
31
  VALID_SPECIALTIES = ["Pulmonology", "Allergy/Immunology", "Internal Medicine", "Family Medicine", "Respiratory"]
@@ -45,13 +57,38 @@ PLAN_PROBS = [0.20, 0.18, 0.15, 0.12, 0.10] + [0.05, 0.08, 0.04, 0.03, 0.05]
45
  # Pharmacy subcategories
46
  PHARM_SUBCATS = ["Chain", "Independent", "Mail Order", "Clinic", "Long-Term Care", "Supermarket"]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  def generate_patient_profiles(n):
50
  """Generate patient-level profiles with fill count distributions."""
51
- # Most patients have 2-8 fills; some are one-and-done (fraud signal)
52
- fill_counts = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 10, 12],
53
- size=n, p=[0.10, 0.15, 0.20, 0.20, 0.15, 0.08, 0.05, 0.03, 0.02, 0.02])
54
- # Embed fraud: ~5% patients are "one-and-done" (fraudulent singletons with suspicious patterns)
55
  fraud_one_done = np.random.random(n) < 0.05
56
  fill_counts = np.where(fraud_one_done, 1, fill_counts)
57
 
@@ -68,14 +105,12 @@ def generate_patient_profiles(n):
68
  def generate_hcp_profiles(n):
69
  """Generate HCP profiles with fraud concentration."""
70
  specialties = np.random.choice(ALL_SPECIALTIES, size=n, p=SPEC_PROBS)
71
- # Embed fraud: ~10% of HCPs are suspicious (high one-and-done concentration)
72
  fraud_hcp = np.random.random(n) < 0.10
73
  hcps = pd.DataFrame({
74
  "hcp_id": [f"EL_HCP{str(i+1).zfill(4)}" for i in range(n)],
75
  "hcp_specialty": specialties,
76
  "is_fraud_hcp": fraud_hcp,
77
  })
78
- # For fraud HCPs, override to suspicious specialty
79
  hcps.loc[hcps["is_fraud_hcp"], "hcp_specialty"] = np.random.choice(SUSPICIOUS_SPECIALTIES,
80
  size=hcps["is_fraud_hcp"].sum())
81
  return hcps
@@ -85,7 +120,6 @@ def generate_pharmacy_profiles(n):
85
  """Generate pharmacy profiles with fraud concentration."""
86
  subcats = np.random.choice(PHARM_SUBCATS, size=n, p=[0.35, 0.30, 0.10, 0.10, 0.10, 0.05])
87
  states = np.random.choice(STATES, size=n)
88
- # Embed fraud: ~8% of pharmacies are suspicious (high HCP concentration)
89
  fraud_pharm = np.random.random(n) < 0.08
90
  pharm = pd.DataFrame({
91
  "pharmacy_id": [f"EL_PH{str(i+1).zfill(5)}" for i in range(n)],
@@ -98,12 +132,11 @@ def generate_pharmacy_profiles(n):
98
 
99
 
100
  def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5000):
101
- """Generate transaction-level claims with embedded fraud patterns."""
102
  records = []
103
  claim_counter = 0
104
 
105
  # Assign patients to HCPs (with concentration for fraud HCPs)
106
- # Normal: patients see 1-2 HCPs. Fraud HCPs attract more patients with single fills.
107
  for _, pat in patient_profiles.iterrows():
108
  n_fills = pat["expected_fills"]
109
  pat_id = pat["patient_id"]
@@ -112,18 +145,13 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
112
 
113
  # Choose HCP(s)
114
  if pat_fraud:
115
- # Fraud patient: 80% chance assigned to a fraud HCP
116
  fraud_hcps = hcp_profiles[hcp_profiles["is_fraud_hcp"]]
117
  if len(fraud_hcps) > 0 and np.random.random() < 0.80:
118
  hcp = fraud_hcps.sample(1).iloc[0]
119
  else:
120
  hcp = hcp_profiles.sample(1).iloc[0]
121
  else:
122
- # Normal patient: 90% stays with same HCP, 10% switches
123
- if n_fills == 1:
124
- hcp = hcp_profiles.sample(1).iloc[0]
125
- else:
126
- hcp = hcp_profiles.sample(1).iloc[0]
127
 
128
  hcp_id = hcp["hcp_id"]
129
  hcp_spec = hcp["hcp_specialty"]
@@ -131,19 +159,17 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
131
 
132
  # Choose pharmacy
133
  if pat_fraud or hcp_fraud:
134
- # Fraud patterns: ~50% chance go to fraud pharmacy
135
  fraud_pharm = pharmacy_profiles[pharmacy_profiles["is_fraud_pharmacy"]]
136
  if len(fraud_pharm) > 0 and np.random.random() < 0.50:
137
  pharm = fraud_pharm.sample(1).iloc[0]
138
  else:
139
  pharm = pharmacy_profiles.sample(1).iloc[0]
140
  else:
141
- # Normal: mostly retail/chain
142
  normal_pharm = pharmacy_profiles[pharmacy_profiles["pharmacy_subcategory"].isin(["Chain", "Independent", "Supermarket"])]
143
  if len(normal_pharm) > 0:
144
  pharm = normal_pharm.sample(1).iloc[0]
145
  else:
146
- pharm = pharmacy_profiles.sample(1).iloc[0]
147
 
148
  pharm_id = pharm["pharmacy_id"]
149
  pharm_state = pharm["pharmacy_state"]
@@ -153,9 +179,10 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
153
  # Cross-state indicator
154
  cross_state = (pat_state != pharm_state)
155
 
 
 
 
156
  # Generate fill dates
157
- # Normal: 30-day intervals with some randomness
158
- # Fraud: very short gaps or single fill
159
  if n_fills == 1:
160
  base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 330))
161
  fill_dates = [base_date]
@@ -163,7 +190,6 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
163
  base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 30))
164
  fill_dates = [base_date]
165
  for i in range(1, n_fills):
166
- # Normal gap: 25-35 days. Fraud gap: 5-15 days (early refill)
167
  if pat_fraud and np.random.random() < 0.6:
168
  gap = np.random.randint(5, 16)
169
  else:
@@ -176,7 +202,7 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
176
  for idx, fill_date in enumerate(fill_dates):
177
  claim_counter += 1
178
 
179
- # Government insurance flag (fraud: higher chance for fraud patients/HCPs)
180
  if pat_fraud or hcp_fraud:
181
  govt_prob = 0.25
182
  else:
@@ -184,40 +210,82 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
184
  is_govt = np.random.random() < govt_prob
185
  plan = np.random.choice(GOVT_PLANS) if is_govt else np.random.choice(COMMERCIAL_PLANS)
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  # NDC
188
  ndc = np.random.choice(NDC_LIST, p=NDC_PROB)
189
  # Fraud: occasionally switch NDC for fraud patients
190
  if pat_fraud and idx > 0 and np.random.random() < 0.3:
191
  ndc = NDC_LIST[1] if ndc == NDC_LIST[0] else NDC_LIST[0]
192
 
193
- # Quantity: mostly 1, fraud patients sometimes get 2
 
 
 
 
194
  qty = 1
195
  if pat_fraud and np.random.random() < 0.2:
196
  qty = 2
 
 
 
197
 
198
- # Days supply: mostly 30, fraud patients sometimes get 90
199
  ds = 30
200
  if pat_fraud and np.random.random() < 0.15:
201
  ds = 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- # Financials
204
  usual_customary = np.random.uniform(500, 800)
205
  copay_before = usual_customary
206
- benefit = np.random.uniform(450, 650)
207
  copay_after = max(0, copay_before - benefit)
208
- # Fraud HCPs may have higher average benefit
 
209
  if hcp_fraud:
210
  benefit += np.random.uniform(50, 200)
211
  copay_after = max(0, copay_before - benefit)
212
 
213
- # Reject codes
214
- reject_code = None
215
- if np.random.random() < 0.04:
216
- reject_code = np.random.choice(["76", "88", "79"])
217
- if pat_fraud and np.random.random() < 0.15:
218
- reject_code = np.random.choice(["76", "88", "79"])
219
-
220
- # Paper submission (fraud: higher for fraud patients)
221
  paper_sub = 0
222
  if pat_fraud and np.random.random() < 0.3:
223
  paper_sub = 1
@@ -228,10 +296,10 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
228
  # DAW code
229
  daw = np.random.choice(["0", "1", "2"], p=[0.85, 0.10, 0.05])
230
 
231
- # Claim type: Normal (N), Reversal (R), Adjustment (A)
232
  claim_type = np.random.choice(["N", "R", "A"], p=[0.94, 0.04, 0.02])
233
  if pat_fraud and np.random.random() < 0.1:
234
- claim_type = "A" # Adjustments more common for fraud
235
 
236
  # Card ID
237
  card_id = f"EL_C{np.random.randint(100000, 999999)}"
@@ -239,37 +307,66 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
239
  # RX number
240
  rx_num = f"EL_RX{np.random.randint(1000000, 9999999)}"
241
 
242
- # Date written (typically 0-5 days before fill)
243
  date_written = fill_date - timedelta(days=np.random.randint(0, 6))
244
 
245
  # BIN/PCN
246
  bin_num = f"{np.random.randint(100000, 999999)}"
247
  pcn = f"PCN{np.random.randint(10, 99)}"
248
 
249
- # Group number
250
- group_num = f"GRP{np.random.randint(1000, 9999)}"
 
 
 
 
 
251
 
252
- # Is this claim fraudulent?
253
  is_fraud = 0
254
  fraud_type = ""
255
- if pat_fraud:
 
 
 
256
  is_fraud = 1
257
- fraud_type = "patient_fraud"
258
- elif hcp_fraud and (hcp["hcp_specialty"] in SUSPICIOUS_SPECIALTIES):
 
259
  is_fraud = 1
260
- fraud_type = "hcp_fraud"
 
 
 
 
 
261
  elif is_govt:
262
  is_fraud = 1
263
  fraud_type = "govt_insurance"
 
 
 
 
 
264
  elif reject_code in ["76", "88"]:
265
  is_fraud = 1
266
  fraud_type = "reject_fraud"
 
267
  elif cross_state and np.random.random() < 0.3:
268
  is_fraud = 1
269
  fraud_type = "cross_state"
 
270
  elif qty != 1 or ds != 30:
271
  is_fraud = 1
272
  fraud_type = "quantity_anomaly"
 
 
 
 
 
 
 
 
273
 
274
  record = {
275
  # Patient
@@ -315,8 +412,11 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
315
  "SUBMISSION_METHOD": "PAPER" if paper_sub else "ELECTRONIC",
316
  "SUBMISSION_TYPE": "NEW",
317
  "PAYMENT_METHOD": "ACH",
 
 
 
318
  # Program
319
- "GROUP_NUMBER": group_num,
320
  # Metadata
321
  "FILE_NAME": "elaad_test_data_2024.txt",
322
  # Ground truth
@@ -334,7 +434,7 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
334
  def main():
335
  os.makedirs("data", exist_ok=True)
336
 
337
- print("Generating ELAAD-style synthetic test data...")
338
  print(f" Patients: {N_PATIENTS}")
339
  print(f" HCPs: {N_HCPS}")
340
  print(f" Pharmacies: {N_PHARMACIES}")
@@ -348,9 +448,13 @@ def main():
348
  # Summary
349
  fraud_rate = df["is_fraud"].mean()
350
  fraud_types = df["fraud_type"].value_counts().to_dict()
 
 
351
  print(f"\nGenerated {len(df)} claims")
352
  print(f" Fraud rate: {fraud_rate:.2%}")
353
  print(f" Fraud types: {fraud_types}")
 
 
354
  print(f" Patients: {df['PATIENT_ID'].nunique()}")
355
  print(f" HCPs: {df['HCP_ID'].nunique()}")
356
  print(f" Pharmacies: {df['PHARMACY_NABP_NUMBER'].nunique()}")
 
2
  generate_elaad_test_data.py — ELAAD/APLD-style synthetic copay data generator.
3
  Creates realistic Trelegy Ellipta claims with embedded fraud patterns.
4
 
5
+ FRAUD PATTERNS EMBEDDED (v4 Group-Aware)
6
+ ========================================
7
  1. HCP level: Suspicious HCP with high one-and-done patient concentration
8
  2. Pharmacy level: Pharmacy with high HCP concentration + high one-and-done patients
9
  3. Patient level: Patients with very short gap bursts or one-and-done patterns
10
  4. Transaction level: Early refills, wrong quantity, govt insurance, NDC switches
11
+ 5. GROUP-AWARE FRAUD (NEW):
12
+ - Group 8200 cash claims (not covered in 2025 design)
13
+ - Group 8200 commercial rejected claims (not covered in 2025 design)
14
+ - Group 8141 claims with benefit > cap for scenario+days supply
15
+ - Group 8141 using $500/30d cap outside Jan-Mar 2024
16
+ - Annual fill count > 12 per patient per year
17
+ - Annual days supply > 360 per patient per year
18
+ - Non-covered NDCs (not 00173088710 or 00173089310)
19
+ - Government insurance claims receiving GSK benefit
20
+ - Max benefit repeat (patient+pharmacy hitting cap >=3 times)
21
  """
22
 
23
  import os
 
33
  N_PHARMACIES = 25
34
  N_CLAIMS = 5000
35
 
36
+ # Trelegy NDCs (canonical 11-digit)
37
+ NDC_LIST = ["00173089310", "00173088710"]
38
  NDC_PROB = [0.85, 0.15]
39
+ # Non-covered NDC (for fraud embedding)
40
+ NON_COVERED_NDCS = ["00173089999", "00173088888"]
41
 
42
  # Valid/suspicious specialties
43
  VALID_SPECIALTIES = ["Pulmonology", "Allergy/Immunology", "Internal Medicine", "Family Medicine", "Respiratory"]
 
57
  # Pharmacy subcategories
58
  PHARM_SUBCATS = ["Chain", "Independent", "Mail Order", "Clinic", "Long-Term Care", "Supermarket"]
59
 
60
# Benefit caps (dollars) keyed by days-supply bucket (30/60/90); used to
# generate realistic benefit amounts.
GROUP_8141_CASH_CAP = {30: 100, 60: 200, 90: 300}
GROUP_8141_COMM_APPROVED_CAP_PRE_2024 = {30: 200, 60: 400, 90: 600}
GROUP_8141_COMM_APPROVED_CAP_ELEVATED = {30: 500, 60: 1000, 90: 1500}  # Jan-Mar 2024
GROUP_8141_COMM_APPROVED_CAP_POST_MAR_2024 = {30: 200, 60: 400, 90: 600}
GROUP_8141_COMM_REJECTED_CAP = {30: 100, 60: 200, 90: 300}
GROUP_8200_COMM_APPROVED_CAP = {30: 645, 60: 1290, 90: 1935}


def get_cap_for_claim(group_id, scenario, fill_date, days_supply):
    """Return the benefit cap for a group/scenario/fill-date/days-supply combination.

    Args:
        group_id: Program group as a string; "8141" and "8200" are recognised.
        scenario: "cash", "commercial_approved" or "commercial_rejected".
        fill_date: ``datetime`` of the fill. Only consulted for Group 8141
            commercial-approved claims, whose cap depends on the period.
        days_supply: Days supplied; bucketed to 30 (<= 30), 60 (<= 60) or 90.

    Returns:
        The cap from the matching table, or 0 when no table applies (Group 8200
        cash/rejected claims, or an unrecognised group or scenario).
    """
    if days_supply <= 30:
        ds_key = 30
    elif days_supply <= 60:
        ds_key = 60
    else:
        ds_key = 90

    if group_id == "8141":
        if scenario == "cash":
            return GROUP_8141_CASH_CAP[ds_key]
        if scenario == "commercial_rejected":
            # Use the dedicated rejected-claim table rather than borrowing the
            # cash table, so the two can be tuned independently.
            return GROUP_8141_COMM_REJECTED_CAP[ds_key]
        if scenario == "commercial_approved":
            if fill_date < datetime(2024, 1, 1):
                return GROUP_8141_COMM_APPROVED_CAP_PRE_2024[ds_key]
            # Half-open upper bound: a fill stamped at any time on 31 March
            # still belongs to the elevated Jan-Mar 2024 window.
            if fill_date < datetime(2024, 4, 1):
                return GROUP_8141_COMM_APPROVED_CAP_ELEVATED[ds_key]
            return GROUP_8141_COMM_APPROVED_CAP_POST_MAR_2024[ds_key]
    elif group_id == "8200":
        if scenario == "commercial_approved":
            return GROUP_8200_COMM_APPROVED_CAP[ds_key]
    return 0
86
+
87
 
88
  def generate_patient_profiles(n):
89
  """Generate patient-level profiles with fill count distributions."""
90
+ fill_counts = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
91
+ size=n, p=[0.10, 0.15, 0.20, 0.20, 0.12, 0.08, 0.05, 0.03, 0.02, 0.02, 0.015, 0.015])
 
 
92
  fraud_one_done = np.random.random(n) < 0.05
93
  fill_counts = np.where(fraud_one_done, 1, fill_counts)
94
 
 
105
  def generate_hcp_profiles(n):
106
  """Generate HCP profiles with fraud concentration."""
107
  specialties = np.random.choice(ALL_SPECIALTIES, size=n, p=SPEC_PROBS)
 
108
  fraud_hcp = np.random.random(n) < 0.10
109
  hcps = pd.DataFrame({
110
  "hcp_id": [f"EL_HCP{str(i+1).zfill(4)}" for i in range(n)],
111
  "hcp_specialty": specialties,
112
  "is_fraud_hcp": fraud_hcp,
113
  })
 
114
  hcps.loc[hcps["is_fraud_hcp"], "hcp_specialty"] = np.random.choice(SUSPICIOUS_SPECIALTIES,
115
  size=hcps["is_fraud_hcp"].sum())
116
  return hcps
 
120
  """Generate pharmacy profiles with fraud concentration."""
121
  subcats = np.random.choice(PHARM_SUBCATS, size=n, p=[0.35, 0.30, 0.10, 0.10, 0.10, 0.05])
122
  states = np.random.choice(STATES, size=n)
 
123
  fraud_pharm = np.random.random(n) < 0.08
124
  pharm = pd.DataFrame({
125
  "pharmacy_id": [f"EL_PH{str(i+1).zfill(5)}" for i in range(n)],
 
132
 
133
 
134
  def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5000):
135
+ """Generate transaction-level claims with embedded fraud patterns including group-aware fraud."""
136
  records = []
137
  claim_counter = 0
138
 
139
  # Assign patients to HCPs (with concentration for fraud HCPs)
 
140
  for _, pat in patient_profiles.iterrows():
141
  n_fills = pat["expected_fills"]
142
  pat_id = pat["patient_id"]
 
145
 
146
  # Choose HCP(s)
147
  if pat_fraud:
 
148
  fraud_hcps = hcp_profiles[hcp_profiles["is_fraud_hcp"]]
149
  if len(fraud_hcps) > 0 and np.random.random() < 0.80:
150
  hcp = fraud_hcps.sample(1).iloc[0]
151
  else:
152
  hcp = hcp_profiles.sample(1).iloc[0]
153
  else:
154
+ hcp = hcp_profiles.sample(1).iloc[0]
 
 
 
 
155
 
156
  hcp_id = hcp["hcp_id"]
157
  hcp_spec = hcp["hcp_specialty"]
 
159
 
160
  # Choose pharmacy
161
  if pat_fraud or hcp_fraud:
 
162
  fraud_pharm = pharmacy_profiles[pharmacy_profiles["is_fraud_pharmacy"]]
163
  if len(fraud_pharm) > 0 and np.random.random() < 0.50:
164
  pharm = fraud_pharm.sample(1).iloc[0]
165
  else:
166
  pharm = pharmacy_profiles.sample(1).iloc[0]
167
  else:
 
168
  normal_pharm = pharmacy_profiles[pharmacy_profiles["pharmacy_subcategory"].isin(["Chain", "Independent", "Supermarket"])]
169
  if len(normal_pharm) > 0:
170
  pharm = normal_pharm.sample(1).iloc[0]
171
  else:
172
+ pharm = pharmacy_profiles.sample(1).iloc[1]
173
 
174
  pharm_id = pharm["pharmacy_id"]
175
  pharm_state = pharm["pharmacy_state"]
 
179
  # Cross-state indicator
180
  cross_state = (pat_state != pharm_state)
181
 
182
+ # Group assignment: ~60% Group 8141, ~40% Group 8200
183
+ group_id = np.random.choice(["8141", "8200"], p=[0.60, 0.40])
184
+
185
  # Generate fill dates
 
 
186
  if n_fills == 1:
187
  base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 330))
188
  fill_dates = [base_date]
 
190
  base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 30))
191
  fill_dates = [base_date]
192
  for i in range(1, n_fills):
 
193
  if pat_fraud and np.random.random() < 0.6:
194
  gap = np.random.randint(5, 16)
195
  else:
 
202
  for idx, fill_date in enumerate(fill_dates):
203
  claim_counter += 1
204
 
205
+ # Government insurance flag
206
  if pat_fraud or hcp_fraud:
207
  govt_prob = 0.25
208
  else:
 
210
  is_govt = np.random.random() < govt_prob
211
  plan = np.random.choice(GOVT_PLANS) if is_govt else np.random.choice(COMMERCIAL_PLANS)
212
 
213
+ # Determine claim scenario
214
+ if is_govt:
215
+ # Government claims: 30% are marked "commercial_rejected" (reject code 76, 88 or 79); the rest stay "commercial_approved"
216
+ if np.random.random() < 0.3:
217
+ scenario = "commercial_rejected"
218
+ reject_code = np.random.choice(["76", "88", "79"])
219
+ else:
220
+ scenario = "commercial_approved"
221
+ reject_code = None
222
+ else:
223
+ # Normal commercial distribution
224
+ if np.random.random() < 0.08:
225
+ scenario = "cash"
226
+ reject_code = None
227
+ elif np.random.random() < 0.06:
228
+ scenario = "commercial_rejected"
229
+ reject_code = np.random.choice(["75", "77", "80"])
230
+ else:
231
+ scenario = "commercial_approved"
232
+ reject_code = None
233
+
234
  # NDC
235
  ndc = np.random.choice(NDC_LIST, p=NDC_PROB)
236
  # Fraud: occasionally switch NDC for fraud patients
237
  if pat_fraud and idx > 0 and np.random.random() < 0.3:
238
  ndc = NDC_LIST[1] if ndc == NDC_LIST[0] else NDC_LIST[0]
239
 
240
+ # Non-covered NDC fraud: 15% chance per claim for fraud patients (matches the 0.15 threshold below)
241
+ if pat_fraud and np.random.random() < 0.15:
242
+ ndc = np.random.choice(NON_COVERED_NDCS)
243
+
244
+ # Quantity
245
  qty = 1
246
  if pat_fraud and np.random.random() < 0.2:
247
  qty = 2
248
+ # Out of range fraud
249
+ if np.random.random() < 0.01:
250
+ qty = np.random.choice([0, 4, 5])
251
 
252
+ # Days supply
253
  ds = 30
254
  if pat_fraud and np.random.random() < 0.15:
255
  ds = 90
256
+ # Out of range fraud
257
+ if np.random.random() < 0.01:
258
+ ds = np.random.choice([0, 95, 100])
259
+
260
+ # Calculate proper cap
261
+ cap = get_cap_for_claim(group_id, scenario, fill_date, ds)
262
+
263
+ # Financials — base on cap with some variation
264
+ if cap > 0:
265
+ # Normal: benefit is 70-100% of cap
266
+ benefit = cap * np.random.uniform(0.70, 1.0)
267
+ # Fraud: sometimes exceed cap by 10-50%
268
+ if pat_fraud and np.random.random() < 0.3:
269
+ benefit = cap * np.random.uniform(1.10, 1.50)
270
+ # Group 8141 Jan-Mar elevated benefit fraud: use $500 cap outside that period
271
+ if group_id == "8141" and scenario == "commercial_approved" and not (datetime(2024, 1, 1) <= fill_date <= datetime(2024, 3, 31)):
272
+ if np.random.random() < 0.1: # 10% chance of wrong-period fraud
273
+ elevated_cap = GROUP_8141_COMM_APPROVED_CAP_ELEVATED[30 if ds <= 30 else (60 if ds <= 60 else 90)]
274
+ benefit = elevated_cap * np.random.uniform(0.8, 1.0)
275
+ else:
276
+ # Non-covered scenario (e.g., cash under Group 8200)
277
+ benefit = np.random.uniform(50, 300) # Still paid (fraud!)
278
 
 
279
  usual_customary = np.random.uniform(500, 800)
280
  copay_before = usual_customary
 
281
  copay_after = max(0, copay_before - benefit)
282
+
283
+ # HCP fraud: higher benefit
284
  if hcp_fraud:
285
  benefit += np.random.uniform(50, 200)
286
  copay_after = max(0, copay_before - benefit)
287
 
288
+ # Paper submission
 
 
 
 
 
 
 
289
  paper_sub = 0
290
  if pat_fraud and np.random.random() < 0.3:
291
  paper_sub = 1
 
296
  # DAW code
297
  daw = np.random.choice(["0", "1", "2"], p=[0.85, 0.10, 0.05])
298
 
299
+ # Claim type
300
  claim_type = np.random.choice(["N", "R", "A"], p=[0.94, 0.04, 0.02])
301
  if pat_fraud and np.random.random() < 0.1:
302
+ claim_type = "A"
303
 
304
  # Card ID
305
  card_id = f"EL_C{np.random.randint(100000, 999999)}"
 
307
  # RX number
308
  rx_num = f"EL_RX{np.random.randint(1000000, 9999999)}"
309
 
310
+ # Date written
311
  date_written = fill_date - timedelta(days=np.random.randint(0, 6))
312
 
313
  # BIN/PCN
314
  bin_num = f"{np.random.randint(100000, 999999)}"
315
  pcn = f"PCN{np.random.randint(10, 99)}"
316
 
317
+ # Payer status derived from scenario
318
+ payer_status_map = {
319
+ "cash": "CASH",
320
+ "commercial_approved": "APPROVED",
321
+ "commercial_rejected": "REJECTED",
322
+ }
323
+ payer_status = payer_status_map.get(scenario, "APPROVED")
324
 
325
+ # Is this claim fraudulent? (ground truth)
326
  is_fraud = 0
327
  fraud_type = ""
328
+
329
+ # Check group-aware fraud conditions
330
+ # Group 8200 non-covered scenario
331
+ if group_id == "8200" and scenario in ["cash", "commercial_rejected"]:
332
  is_fraud = 1
333
+ fraud_type = "group_8200_non_covered_scenario"
334
+ # Benefit exceeds cap
335
+ elif cap > 0 and benefit > cap:
336
  is_fraud = 1
337
+ fraud_type = "benefit_cap_exceeded"
338
+ # Group 8141 wrong-period elevated benefit
339
+ elif group_id == "8141" and scenario == "commercial_approved" and benefit > get_cap_for_claim("8141", "commercial_approved", fill_date, ds) and not (datetime(2024, 1, 1) <= fill_date <= datetime(2024, 3, 31)):
340
+ is_fraud = 1
341
+ fraud_type = "group_8141_wrong_period_benefit"
342
+ # Government insurance
343
  elif is_govt:
344
  is_fraud = 1
345
  fraud_type = "govt_insurance"
346
+ # Non-covered NDC
347
+ elif ndc in NON_COVERED_NDCS:
348
+ is_fraud = 1
349
+ fraud_type = "non_covered_ndc"
350
+ # Reject fraud
351
  elif reject_code in ["76", "88"]:
352
  is_fraud = 1
353
  fraud_type = "reject_fraud"
354
+ # Cross-state
355
  elif cross_state and np.random.random() < 0.3:
356
  is_fraud = 1
357
  fraud_type = "cross_state"
358
+ # Quantity/days supply anomaly
359
  elif qty != 1 or ds != 30:
360
  is_fraud = 1
361
  fraud_type = "quantity_anomaly"
362
+ # HCP fraud
363
+ elif hcp_fraud and (hcp_spec in SUSPICIOUS_SPECIALTIES):
364
+ is_fraud = 1
365
+ fraud_type = "hcp_fraud"
366
+ # Patient fraud
367
+ elif pat_fraud:
368
+ is_fraud = 1
369
+ fraud_type = "patient_fraud"
370
 
371
  record = {
372
  # Patient
 
412
  "SUBMISSION_METHOD": "PAPER" if paper_sub else "ELECTRONIC",
413
  "SUBMISSION_TYPE": "NEW",
414
  "PAYMENT_METHOD": "ACH",
415
+ # Payer status
416
+ "PAYER_STATUS": payer_status,
417
+ "CLAIM_SCENARIO": scenario,
418
  # Program
419
+ "GROUP_ID": group_id,
420
  # Metadata
421
  "FILE_NAME": "elaad_test_data_2024.txt",
422
  # Ground truth
 
434
  def main():
435
  os.makedirs("data", exist_ok=True)
436
 
437
+ print("Generating ELAAD-style synthetic test data (v4 Group-Aware)...")
438
  print(f" Patients: {N_PATIENTS}")
439
  print(f" HCPs: {N_HCPS}")
440
  print(f" Pharmacies: {N_PHARMACIES}")
 
448
  # Summary
449
  fraud_rate = df["is_fraud"].mean()
450
  fraud_types = df["fraud_type"].value_counts().to_dict()
451
+ group_counts = df["GROUP_ID"].value_counts().to_dict()
452
+ scenario_counts = df["CLAIM_SCENARIO"].value_counts().to_dict()
453
  print(f"\nGenerated {len(df)} claims")
454
  print(f" Fraud rate: {fraud_rate:.2%}")
455
  print(f" Fraud types: {fraud_types}")
456
+ print(f" Groups: {group_counts}")
457
+ print(f" Scenarios: {scenario_counts}")
458
  print(f" Patients: {df['PATIENT_ID'].nunique()}")
459
  print(f" HCPs: {df['HCP_ID'].nunique()}")
460
  print(f" Pharmacies: {df['PHARMACY_NABP_NUMBER'].nunique()}")