Harsh2396
/

gsk-copay-fraud-detection

ml-intern

Model card Files Files and versions

xet

Community

Harsh2396 committed 5 days ago

Commit

ca13beb

verified ·

1 Parent(s): c093a23

Upload generate_elaad_test_data.py with huggingface_hub

Browse files

Files changed (1) hide show

generate_elaad_test_data.py +155 -51

generate_elaad_test_data.py CHANGED Viewed

@@ -2,12 +2,22 @@
 generate_elaad_test_data.py — ELAAD/APLD-style synthetic copay data generator.
 Creates realistic Trelegy Ellipta claims with embedded fraud patterns.
-FRAUD PATTERNS EMBEDDED
-=======================
 1. HCP level: Suspicious HCP with high one-and-done patient concentration
 2. Pharmacy level: Pharmacy with high HCP concentration + high one-and-done patients
 3. Patient level: Patients with very short gap bursts or one-and-done patterns
 4. Transaction level: Early refills, wrong quantity, govt insurance, NDC switches
 """
 import os
@@ -23,9 +33,11 @@ N_HCPS = 40
 N_PHARMACIES = 25
 N_CLAIMS = 5000
-# Trelegy NDCs
-NDC_LIST = ["00173089314", "00173088714"]
 NDC_PROB = [0.85, 0.15]
 # Valid/suspicious specialties
 VALID_SPECIALTIES = ["Pulmonology", "Allergy/Immunology", "Internal Medicine", "Family Medicine", "Respiratory"]
@@ -45,13 +57,38 @@ PLAN_PROBS = [0.20, 0.18, 0.15, 0.12, 0.10] + [0.05, 0.08, 0.04, 0.03, 0.05]
 # Pharmacy subcategories
 PHARM_SUBCATS = ["Chain", "Independent", "Mail Order", "Clinic", "Long-Term Care", "Supermarket"]
 def generate_patient_profiles(n):
     """Generate patient-level profiles with fill count distributions."""
-    # Most patients have 2-8 fills; some are one-and-done (fraud signal)
-    fill_counts = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 10, 12],
-                                    size=n, p=[0.10, 0.15, 0.20, 0.20, 0.15, 0.08, 0.05, 0.03, 0.02, 0.02])
-    # Embed fraud: ~5% patients are "one-and-done" (fraudulent singletons with suspicious patterns)
     fraud_one_done = np.random.random(n) < 0.05
     fill_counts = np.where(fraud_one_done, 1, fill_counts)
@@ -68,14 +105,12 @@ def generate_patient_profiles(n):
 def generate_hcp_profiles(n):
     """Generate HCP profiles with fraud concentration."""
     specialties = np.random.choice(ALL_SPECIALTIES, size=n, p=SPEC_PROBS)
-    # Embed fraud: ~10% of HCPs are suspicious (high one-and-done concentration)
     fraud_hcp = np.random.random(n) < 0.10
     hcps = pd.DataFrame({
         "hcp_id": [f"EL_HCP{str(i+1).zfill(4)}" for i in range(n)],
         "hcp_specialty": specialties,
         "is_fraud_hcp": fraud_hcp,
     })
-    # For fraud HCPs, override to suspicious specialty
     hcps.loc[hcps["is_fraud_hcp"], "hcp_specialty"] = np.random.choice(SUSPICIOUS_SPECIALTIES,
                                                                           size=hcps["is_fraud_hcp"].sum())
     return hcps
@@ -85,7 +120,6 @@ def generate_pharmacy_profiles(n):
     """Generate pharmacy profiles with fraud concentration."""
     subcats = np.random.choice(PHARM_SUBCATS, size=n, p=[0.35, 0.30, 0.10, 0.10, 0.10, 0.05])
     states = np.random.choice(STATES, size=n)
-    # Embed fraud: ~8% of pharmacies are suspicious (high HCP concentration)
     fraud_pharm = np.random.random(n) < 0.08
     pharm = pd.DataFrame({
         "pharmacy_id": [f"EL_PH{str(i+1).zfill(5)}" for i in range(n)],
@@ -98,12 +132,11 @@ def generate_pharmacy_profiles(n):
 def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5000):
-    """Generate transaction-level claims with embedded fraud patterns."""
     records = []
     claim_counter = 0
     # Assign patients to HCPs (with concentration for fraud HCPs)
-    # Normal: patients see 1-2 HCPs. Fraud HCPs attract more patients with single fills.
     for _, pat in patient_profiles.iterrows():
         n_fills = pat["expected_fills"]
         pat_id = pat["patient_id"]
@@ -112,18 +145,13 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
         # Choose HCP(s)
         if pat_fraud:
-            # Fraud patient: 80% chance assigned to a fraud HCP
             fraud_hcps = hcp_profiles[hcp_profiles["is_fraud_hcp"]]
             if len(fraud_hcps) > 0 and np.random.random() < 0.80:
                 hcp = fraud_hcps.sample(1).iloc[0]
             else:
                 hcp = hcp_profiles.sample(1).iloc[0]
         else:
-            # Normal patient: 90% stays with same HCP, 10% switches
-            if n_fills == 1:
-                hcp = hcp_profiles.sample(1).iloc[0]
-            else:
-                hcp = hcp_profiles.sample(1).iloc[0]
         hcp_id = hcp["hcp_id"]
         hcp_spec = hcp["hcp_specialty"]
@@ -131,19 +159,17 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
         # Choose pharmacy
         if pat_fraud or hcp_fraud:
-            # Fraud patterns: ~50% chance go to fraud pharmacy
             fraud_pharm = pharmacy_profiles[pharmacy_profiles["is_fraud_pharmacy"]]
             if len(fraud_pharm) > 0 and np.random.random() < 0.50:
                 pharm = fraud_pharm.sample(1).iloc[0]
             else:
                 pharm = pharmacy_profiles.sample(1).iloc[0]
         else:
-            # Normal: mostly retail/chain
             normal_pharm = pharmacy_profiles[pharmacy_profiles["pharmacy_subcategory"].isin(["Chain", "Independent", "Supermarket"])]
             if len(normal_pharm) > 0:
                 pharm = normal_pharm.sample(1).iloc[0]
             else:
-                pharm = pharmacy_profiles.sample(1).iloc[0]
         pharm_id = pharm["pharmacy_id"]
         pharm_state = pharm["pharmacy_state"]
@@ -153,9 +179,10 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
         # Cross-state indicator
         cross_state = (pat_state != pharm_state)
         # Generate fill dates
-        # Normal: 30-day intervals with some randomness
-        # Fraud: very short gaps or single fill
         if n_fills == 1:
             base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 330))
             fill_dates = [base_date]
@@ -163,7 +190,6 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
             base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 30))
             fill_dates = [base_date]
             for i in range(1, n_fills):
-                # Normal gap: 25-35 days. Fraud gap: 5-15 days (early refill)
                 if pat_fraud and np.random.random() < 0.6:
                     gap = np.random.randint(5, 16)
                 else:
@@ -176,7 +202,7 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
         for idx, fill_date in enumerate(fill_dates):
             claim_counter += 1
-            # Government insurance flag (fraud: higher chance for fraud patients/HCPs)
             if pat_fraud or hcp_fraud:
                 govt_prob = 0.25
             else:
@@ -184,40 +210,82 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
             is_govt = np.random.random() < govt_prob
             plan = np.random.choice(GOVT_PLANS) if is_govt else np.random.choice(COMMERCIAL_PLANS)
             # NDC
             ndc = np.random.choice(NDC_LIST, p=NDC_PROB)
             # Fraud: occasionally switch NDC for fraud patients
             if pat_fraud and idx > 0 and np.random.random() < 0.3:
                 ndc = NDC_LIST[1] if ndc == NDC_LIST[0] else NDC_LIST[0]
-            # Quantity: mostly 1, fraud patients sometimes get 2
             qty = 1
             if pat_fraud and np.random.random() < 0.2:
                 qty = 2
-            # Days supply: mostly 30, fraud patients sometimes get 90
             ds = 30
             if pat_fraud and np.random.random() < 0.15:
                 ds = 90
-            # Financials
             usual_customary = np.random.uniform(500, 800)
             copay_before = usual_customary
-            benefit = np.random.uniform(450, 650)
             copay_after = max(0, copay_before - benefit)
-            # Fraud HCPs may have higher average benefit
             if hcp_fraud:
                 benefit += np.random.uniform(50, 200)
                 copay_after = max(0, copay_before - benefit)
-            # Reject codes
-            reject_code = None
-            if np.random.random() < 0.04:
-                reject_code = np.random.choice(["76", "88", "79"])
-            if pat_fraud and np.random.random() < 0.15:
-                reject_code = np.random.choice(["76", "88", "79"])
-            # Paper submission (fraud: higher for fraud patients)
             paper_sub = 0
             if pat_fraud and np.random.random() < 0.3:
                 paper_sub = 1
@@ -228,10 +296,10 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
             # DAW code
             daw = np.random.choice(["0", "1", "2"], p=[0.85, 0.10, 0.05])
-            # Claim type: Normal (N), Reversal (R), Adjustment (A)
             claim_type = np.random.choice(["N", "R", "A"], p=[0.94, 0.04, 0.02])
             if pat_fraud and np.random.random() < 0.1:
-                claim_type = "A"  # Adjustments more common for fraud
             # Card ID
             card_id = f"EL_C{np.random.randint(100000, 999999)}"
@@ -239,37 +307,66 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
             # RX number
             rx_num = f"EL_RX{np.random.randint(1000000, 9999999)}"
-            # Date written (typically 0-5 days before fill)
             date_written = fill_date - timedelta(days=np.random.randint(0, 6))
             # BIN/PCN
             bin_num = f"{np.random.randint(100000, 999999)}"
             pcn = f"PCN{np.random.randint(10, 99)}"
-            # Group number
-            group_num = f"GRP{np.random.randint(1000, 9999)}"
-            # Is this claim fraudulent?
             is_fraud = 0
             fraud_type = ""
-            if pat_fraud:
                 is_fraud = 1
-                fraud_type = "patient_fraud"
-            elif hcp_fraud and (hcp["hcp_specialty"] in SUSPICIOUS_SPECIALTIES):
                 is_fraud = 1
-                fraud_type = "hcp_fraud"
             elif is_govt:
                 is_fraud = 1
                 fraud_type = "govt_insurance"
             elif reject_code in ["76", "88"]:
                 is_fraud = 1
                 fraud_type = "reject_fraud"
             elif cross_state and np.random.random() < 0.3:
                 is_fraud = 1
                 fraud_type = "cross_state"
             elif qty != 1 or ds != 30:
                 is_fraud = 1
                 fraud_type = "quantity_anomaly"
             record = {
                 # Patient
@@ -315,8 +412,11 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
                 "SUBMISSION_METHOD": "PAPER" if paper_sub else "ELECTRONIC",
                 "SUBMISSION_TYPE": "NEW",
                 "PAYMENT_METHOD": "ACH",
                 # Program
-                "GROUP_NUMBER": group_num,
                 # Metadata
                 "FILE_NAME": "elaad_test_data_2024.txt",
                 # Ground truth
@@ -334,7 +434,7 @@ def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5
 def main():
     os.makedirs("data", exist_ok=True)
-    print("Generating ELAAD-style synthetic test data...")
     print(f"  Patients: {N_PATIENTS}")
     print(f"  HCPs: {N_HCPS}")
     print(f"  Pharmacies: {N_PHARMACIES}")
@@ -348,9 +448,13 @@ def main():
     # Summary
     fraud_rate = df["is_fraud"].mean()
     fraud_types = df["fraud_type"].value_counts().to_dict()
     print(f"\nGenerated {len(df)} claims")
     print(f"  Fraud rate: {fraud_rate:.2%}")
     print(f"  Fraud types: {fraud_types}")
     print(f"  Patients: {df['PATIENT_ID'].nunique()}")
     print(f"  HCPs: {df['HCP_ID'].nunique()}")
     print(f"  Pharmacies: {df['PHARMACY_NABP_NUMBER'].nunique()}")

 generate_elaad_test_data.py — ELAAD/APLD-style synthetic copay data generator.
 Creates realistic Trelegy Ellipta claims with embedded fraud patterns.
+FRAUD PATTERNS EMBEDDED (v4 Group-Aware)
+========================================
 1. HCP level: Suspicious HCP with high one-and-done patient concentration
 2. Pharmacy level: Pharmacy with high HCP concentration + high one-and-done patients
 3. Patient level: Patients with very short gap bursts or one-and-done patterns
 4. Transaction level: Early refills, wrong quantity, govt insurance, NDC switches
+5. GROUP-AWARE FRAUD (NEW):
+   - Group 8200 cash claims (not covered in 2025 design)
+   - Group 8200 commercial rejected claims (not covered in 2025 design)
+   - Group 8141 claims with benefit > cap for scenario+days supply
+   - Group 8141 using $500/30d cap outside Jan-Mar 2024
+   - Annual fill count > 12 per patient per year
+   - Annual days supply > 360 per patient per year
+   - Non-covered NDCs (not 00173088710 or 00173089310)
+   - Government insurance claims receiving GSK benefit
+   - Max benefit repeat (patient+pharmacy hitting cap >=3 times)
 """
 import os
 N_PHARMACIES = 25
 N_CLAIMS = 5000
+# Trelegy NDCs (canonical 11-digit)
+NDC_LIST = ["00173089310", "00173088710"]
 NDC_PROB = [0.85, 0.15]
+# Non-covered NDC (for fraud embedding)
+NON_COVERED_NDCS = ["00173089999", "00173088888"]
 # Valid/suspicious specialties
 VALID_SPECIALTIES = ["Pulmonology", "Allergy/Immunology", "Internal Medicine", "Family Medicine", "Respiratory"]
 # Pharmacy subcategories
 PHARM_SUBCATS = ["Chain", "Independent", "Mail Order", "Clinic", "Long-Term Care", "Supermarket"]
# Benefit caps (dollars) used for generating realistic benefit amounts.
# Each table is keyed by days-supply bucket (30, 60 or 90).
GROUP_8141_CASH_CAP = {30: 100, 60: 200, 90: 300}
GROUP_8141_COMM_APPROVED_CAP_PRE_2024 = {30: 200, 60: 400, 90: 600}
GROUP_8141_COMM_APPROVED_CAP_ELEVATED = {30: 500, 60: 1000, 90: 1500}  # Jan-Mar 2024
GROUP_8141_COMM_APPROVED_CAP_POST_MAR_2024 = {30: 200, 60: 400, 90: 600}
GROUP_8141_COMM_REJECTED_CAP = {30: 100, 60: 200, 90: 300}
GROUP_8200_COMM_APPROVED_CAP = {30: 645, 60: 1290, 90: 1935}


def get_cap_for_claim(group_id, scenario, fill_date, days_supply):
    """Return the benefit cap for a group/scenario/fill-date/days-supply combination.

    Args:
        group_id: Program group as a string. Only "8141" and "8200" have caps.
        scenario: One of "cash", "commercial_approved" or "commercial_rejected".
        fill_date: ``datetime`` of the fill. Only consulted for Group 8141
            "commercial_approved" claims, whose cap depends on the period.
        days_supply: Days supply on the claim. Bucketed as <=30 -> 30,
            <=60 -> 60, anything larger -> 90 (so 0 maps to the 30-day cap and
            values above 90 map to the 90-day cap).

    Returns:
        The cap from the matching table, or 0 when the group/scenario
        combination has no cap defined (e.g. Group 8200 cash or rejected
        claims, or an unknown group).
    """
    if days_supply <= 30:
        ds_key = 30
    elif days_supply <= 60:
        ds_key = 60
    else:
        ds_key = 90

    if group_id == "8141":
        if scenario == "cash":
            return GROUP_8141_CASH_CAP[ds_key]
        if scenario == "commercial_rejected":
            # Read the dedicated rejected-claim table rather than the cash
            # table, so a change to the rejected caps actually takes effect.
            return GROUP_8141_COMM_REJECTED_CAP[ds_key]
        if scenario == "commercial_approved":
            if fill_date < datetime(2024, 1, 1):
                return GROUP_8141_COMM_APPROVED_CAP_PRE_2024[ds_key]
            # NOTE: the upper bound is midnight at the start of 2024-03-31, so
            # a fill_date carrying a later time on that day gets the
            # post-March cap.
            if fill_date <= datetime(2024, 3, 31):
                return GROUP_8141_COMM_APPROVED_CAP_ELEVATED[ds_key]
            return GROUP_8141_COMM_APPROVED_CAP_POST_MAR_2024[ds_key]
    elif group_id == "8200":
        if scenario == "commercial_approved":
            return GROUP_8200_COMM_APPROVED_CAP[ds_key]

    # No cap defined for this group/scenario combination.
    return 0
 def generate_patient_profiles(n):
     """Generate patient-level profiles with fill count distributions."""
+    fill_counts = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16],
+                                    size=n, p=[0.10, 0.15, 0.20, 0.20, 0.12, 0.08, 0.05, 0.03, 0.02, 0.02, 0.015, 0.015])
     fraud_one_done = np.random.random(n) < 0.05
     fill_counts = np.where(fraud_one_done, 1, fill_counts)
 def generate_hcp_profiles(n):
     """Build a DataFrame of ``n`` synthetic HCPs, a random subset flagged as fraudulent.

     Specialties are sampled from ALL_SPECIALTIES weighted by SPEC_PROBS. Each
     HCP is then independently flagged with probability 0.10, and every flagged
     HCP has its specialty replaced by a draw from SUSPICIOUS_SPECIALTIES.

     Returns:
         DataFrame with columns ``hcp_id`` ("EL_HCP0001", "EL_HCP0002", ...),
         ``hcp_specialty`` and ``is_fraud_hcp``.
     """
     # NOTE: the order of the RNG draws below determines the output under a
     # fixed np.random seed.
     drawn_specialties = np.random.choice(ALL_SPECIALTIES, size=n, p=SPEC_PROBS)
     suspicious_mask = np.random.random(n) < 0.10

     hcp_ids = [f"EL_HCP{number:04d}" for number in range(1, n + 1)]
     hcps = pd.DataFrame(
         {
             "hcp_id": hcp_ids,
             "hcp_specialty": drawn_specialties,
             "is_fraud_hcp": suspicious_mask,
         }
     )

     # Overwrite the specialty of flagged HCPs with a suspicious one.
     flagged = hcps["is_fraud_hcp"]
     replacement = np.random.choice(SUSPICIOUS_SPECIALTIES, size=flagged.sum())
     hcps.loc[flagged, "hcp_specialty"] = replacement
     return hcps
     """Generate pharmacy profiles with fraud concentration."""
     subcats = np.random.choice(PHARM_SUBCATS, size=n, p=[0.35, 0.30, 0.10, 0.10, 0.10, 0.05])
     states = np.random.choice(STATES, size=n)
     fraud_pharm = np.random.random(n) < 0.08
     pharm = pd.DataFrame({
         "pharmacy_id": [f"EL_PH{str(i+1).zfill(5)}" for i in range(n)],
 def generate_claims(patient_profiles, hcp_profiles, pharmacy_profiles, n_total=5000):
+    """Generate transaction-level claims with embedded fraud patterns including group-aware fraud."""
     records = []
     claim_counter = 0
     # Assign patients to HCPs (with concentration for fraud HCPs)
     for _, pat in patient_profiles.iterrows():
         n_fills = pat["expected_fills"]
         pat_id = pat["patient_id"]
         # Choose HCP(s)
         if pat_fraud:
             fraud_hcps = hcp_profiles[hcp_profiles["is_fraud_hcp"]]
             if len(fraud_hcps) > 0 and np.random.random() < 0.80:
                 hcp = fraud_hcps.sample(1).iloc[0]
             else:
                 hcp = hcp_profiles.sample(1).iloc[0]
         else:
+            hcp = hcp_profiles.sample(1).iloc[0]
         hcp_id = hcp["hcp_id"]
         hcp_spec = hcp["hcp_specialty"]
         # Choose pharmacy
         if pat_fraud or hcp_fraud:
             fraud_pharm = pharmacy_profiles[pharmacy_profiles["is_fraud_pharmacy"]]
             if len(fraud_pharm) > 0 and np.random.random() < 0.50:
                 pharm = fraud_pharm.sample(1).iloc[0]
             else:
                 pharm = pharmacy_profiles.sample(1).iloc[0]
         else:
             normal_pharm = pharmacy_profiles[pharmacy_profiles["pharmacy_subcategory"].isin(["Chain", "Independent", "Supermarket"])]
             if len(normal_pharm) > 0:
                 pharm = normal_pharm.sample(1).iloc[0]
             else:
+                pharm = pharmacy_profiles.sample(1).iloc[1]
         pharm_id = pharm["pharmacy_id"]
         pharm_state = pharm["pharmacy_state"]
         # Cross-state indicator
         cross_state = (pat_state != pharm_state)
+        # Group assignment: ~60% Group 8141, ~40% Group 8200
+        group_id = np.random.choice(["8141", "8200"], p=[0.60, 0.40])
         # Generate fill dates
         if n_fills == 1:
             base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 330))
             fill_dates = [base_date]
             base_date = datetime(2024, 1, 1) + timedelta(days=np.random.randint(0, 30))
             fill_dates = [base_date]
             for i in range(1, n_fills):
                 if pat_fraud and np.random.random() < 0.6:
                     gap = np.random.randint(5, 16)
                 else:
         for idx, fill_date in enumerate(fill_dates):
             claim_counter += 1
+            # Government insurance flag
             if pat_fraud or hcp_fraud:
                 govt_prob = 0.25
             else:
             is_govt = np.random.random() < govt_prob
             plan = np.random.choice(GOVT_PLANS) if is_govt else np.random.choice(COMMERCIAL_PLANS)
+            # Determine claim scenario
+            if is_govt:
+                # Government claims typically have some form of rejection or special handling
+                if np.random.random() < 0.3:
+                    scenario = "commercial_rejected"
+                    reject_code = np.random.choice(["76", "88", "79"])
+                else:
+                    scenario = "commercial_approved"
+                    reject_code = None
+            else:
+                # Normal commercial distribution
+                if np.random.random() < 0.08:
+                    scenario = "cash"
+                    reject_code = None
+                elif np.random.random() < 0.06:
+                    scenario = "commercial_rejected"
+                    reject_code = np.random.choice(["75", "77", "80"])
+                else:
+                    scenario = "commercial_approved"
+                    reject_code = None
             # NDC
             ndc = np.random.choice(NDC_LIST, p=NDC_PROB)
             # Fraud: occasionally switch NDC for fraud patients
             if pat_fraud and idx > 0 and np.random.random() < 0.3:
                 ndc = NDC_LIST[1] if ndc == NDC_LIST[0] else NDC_LIST[0]
+            # Non-covered NDC fraud: 15% chance per claim for fraud patients
+            if pat_fraud and np.random.random() < 0.15:
+                ndc = np.random.choice(NON_COVERED_NDCS)
+            # Quantity
             qty = 1
             if pat_fraud and np.random.random() < 0.2:
                 qty = 2
+            # Out of range fraud
+            if np.random.random() < 0.01:
+                qty = np.random.choice([0, 4, 5])
+            # Days supply
             ds = 30
             if pat_fraud and np.random.random() < 0.15:
                 ds = 90
+            # Out of range fraud
+            if np.random.random() < 0.01:
+                ds = np.random.choice([0, 95, 100])
+            # Calculate proper cap
+            cap = get_cap_for_claim(group_id, scenario, fill_date, ds)
+            # Financials — base on cap with some variation
+            if cap > 0:
+                # Normal: benefit is 70-100% of cap
+                benefit = cap * np.random.uniform(0.70, 1.0)
+                # Fraud: sometimes exceed cap by 10-50%
+                if pat_fraud and np.random.random() < 0.3:
+                    benefit = cap * np.random.uniform(1.10, 1.50)
+                # Group 8141 Jan-Mar elevated benefit fraud: use $500 cap outside that period
+                if group_id == "8141" and scenario == "commercial_approved" and not (datetime(2024, 1, 1) <= fill_date <= datetime(2024, 3, 31)):
+                    if np.random.random() < 0.1:  # 10% chance of wrong-period fraud
+                        elevated_cap = GROUP_8141_COMM_APPROVED_CAP_ELEVATED[30 if ds <= 30 else (60 if ds <= 60 else 90)]
+                        benefit = elevated_cap * np.random.uniform(0.8, 1.0)
+            else:
+                # Non-covered scenario (e.g., cash under Group 8200)
+                benefit = np.random.uniform(50, 300)  # Still paid (fraud!)
             usual_customary = np.random.uniform(500, 800)
             copay_before = usual_customary
             copay_after = max(0, copay_before - benefit)
+            # HCP fraud: higher benefit
             if hcp_fraud:
                 benefit += np.random.uniform(50, 200)
                 copay_after = max(0, copay_before - benefit)
+            # Paper submission
             paper_sub = 0
             if pat_fraud and np.random.random() < 0.3:
                 paper_sub = 1
             # DAW code
             daw = np.random.choice(["0", "1", "2"], p=[0.85, 0.10, 0.05])
+            # Claim type
             claim_type = np.random.choice(["N", "R", "A"], p=[0.94, 0.04, 0.02])
             if pat_fraud and np.random.random() < 0.1:
+                claim_type = "A"
             # Card ID
             card_id = f"EL_C{np.random.randint(100000, 999999)}"
             # RX number
             rx_num = f"EL_RX{np.random.randint(1000000, 9999999)}"
+            # Date written
             date_written = fill_date - timedelta(days=np.random.randint(0, 6))
             # BIN/PCN
             bin_num = f"{np.random.randint(100000, 999999)}"
             pcn = f"PCN{np.random.randint(10, 99)}"
+            # Payer status derived from scenario
+            payer_status_map = {
+                "cash": "CASH",
+                "commercial_approved": "APPROVED",
+                "commercial_rejected": "REJECTED",
+            }
+            payer_status = payer_status_map.get(scenario, "APPROVED")
+            # Is this claim fraudulent? (ground truth)
             is_fraud = 0
             fraud_type = ""
+            # Check group-aware fraud conditions
+            # Group 8200 non-covered scenario
+            if group_id == "8200" and scenario in ["cash", "commercial_rejected"]:
                 is_fraud = 1
+                fraud_type = "group_8200_non_covered_scenario"
+            # Benefit exceeds cap
+            elif cap > 0 and benefit > cap:
                 is_fraud = 1
+                fraud_type = "benefit_cap_exceeded"
+            # Group 8141 wrong-period elevated benefit
+            elif group_id == "8141" and scenario == "commercial_approved" and benefit > get_cap_for_claim("8141", "commercial_approved", fill_date, ds) and not (datetime(2024, 1, 1) <= fill_date <= datetime(2024, 3, 31)):
+                is_fraud = 1
+                fraud_type = "group_8141_wrong_period_benefit"
+            # Government insurance
             elif is_govt:
                 is_fraud = 1
                 fraud_type = "govt_insurance"
+            # Non-covered NDC
+            elif ndc in NON_COVERED_NDCS:
+                is_fraud = 1
+                fraud_type = "non_covered_ndc"
+            # Reject fraud
             elif reject_code in ["76", "88"]:
                 is_fraud = 1
                 fraud_type = "reject_fraud"
+            # Cross-state
             elif cross_state and np.random.random() < 0.3:
                 is_fraud = 1
                 fraud_type = "cross_state"
+            # Quantity/days supply anomaly
             elif qty != 1 or ds != 30:
                 is_fraud = 1
                 fraud_type = "quantity_anomaly"
+            # HCP fraud
+            elif hcp_fraud and (hcp_spec in SUSPICIOUS_SPECIALTIES):
+                is_fraud = 1
+                fraud_type = "hcp_fraud"
+            # Patient fraud
+            elif pat_fraud:
+                is_fraud = 1
+                fraud_type = "patient_fraud"
             record = {
                 # Patient
                 "SUBMISSION_METHOD": "PAPER" if paper_sub else "ELECTRONIC",
                 "SUBMISSION_TYPE": "NEW",
                 "PAYMENT_METHOD": "ACH",
+                # Payer status
+                "PAYER_STATUS": payer_status,
+                "CLAIM_SCENARIO": scenario,
                 # Program
+                "GROUP_ID": group_id,
                 # Metadata
                 "FILE_NAME": "elaad_test_data_2024.txt",
                 # Ground truth
 def main():
     os.makedirs("data", exist_ok=True)
+    print("Generating ELAAD-style synthetic test data (v4 Group-Aware)...")
     print(f"  Patients: {N_PATIENTS}")
     print(f"  HCPs: {N_HCPS}")
     print(f"  Pharmacies: {N_PHARMACIES}")
     # Summary
     fraud_rate = df["is_fraud"].mean()
     fraud_types = df["fraud_type"].value_counts().to_dict()
+    group_counts = df["GROUP_ID"].value_counts().to_dict()
+    scenario_counts = df["CLAIM_SCENARIO"].value_counts().to_dict()
     print(f"\nGenerated {len(df)} claims")
     print(f"  Fraud rate: {fraud_rate:.2%}")
     print(f"  Fraud types: {fraud_types}")
+    print(f"  Groups: {group_counts}")
+    print(f"  Scenarios: {scenario_counts}")
     print(f"  Patients: {df['PATIENT_ID'].nunique()}")
     print(f"  HCPs: {df['HCP_ID'].nunique()}")
     print(f"  Pharmacies: {df['PHARMACY_NABP_NUMBER'].nunique()}")