Spaces:

Ankit74990
/

honeypot-api

Sleeping

App Files Files Community

Ankit19102004 committed on Feb 20

Commit

3891fd2

1 Parent(s): dd7d86b

initial

Browse files

Files changed (1) hide show

honeypot_api.py +35 -32

honeypot_api.py CHANGED Viewed

@@ -106,62 +106,65 @@ def extract_intelligence(text):
         "orderNumbers": [],
     }
-    # Phone Numbers
-    phone_matches = re.findall(r"\+?\d{1,3}[- ]?\d{10}", text)
-    extracted["phoneNumbers"] = list(set(phone_matches))
-    # Bank Accounts (12–18 digits only)
-    bank_matches = re.findall(r"\b\d{12,18}\b", text)
-    extracted["bankAccounts"] = list(set(bank_matches))
-    # Email Addresses (strict)
-    email_matches = re.findall(
         r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
         text
     )
-    extracted["emailAddresses"] = list(set(email_matches))
-    # UPI IDs (exclude emails with dot-domain)
-    upi_matches = re.findall(
-        r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)?\b",
-        text
-    )
     clean_upi = []
-    for u in upi_matches:
-        if u not in extracted["emailAddresses"] and "." not in u.split("@")[1]:
             clean_upi.append(u)
     extracted["upiIds"] = list(set(clean_upi))
-    # Phishing Links (remove trailing punctuation)
-    link_matches = re.findall(r"https?://[^\s]+", text)
-    clean_links = [l.rstrip(".,)") for l in link_matches]
     extracted["phishingLinks"] = list(set(clean_links))
-    # Case IDs (full capture including suffix)
-    case_matches = re.findall(
-        r"\b(?:CASE|REF|ID|TICKET)[- ]?\d+(?:-\d+)*\b",
         text,
         flags=re.IGNORECASE
     )
-    extracted["caseIds"] = list(set(case_matches))
-    # Policy Numbers (full capture)
-    policy_matches = re.findall(
-        r"\b(?:POLICY|POL|INS)[- ]?\d+(?:-\d+)*\b",
         text,
         flags=re.IGNORECASE
     )
-    extracted["policyNumbers"] = list(set(policy_matches))
-    # Order / Transaction IDs
-    order_matches = re.findall(
         r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b",
         text,
         flags=re.IGNORECASE
     )
-    extracted["orderNumbers"] = list(set(order_matches))
     return extracted
 # ======================================================

         "orderNumbers": [],
     }
+    # Strict Indian phone numbers only (+91-XXXXXXXXXX)
+    phones = re.findall(r"\+91[- ]?\d{10}\b", text)
+    extracted["phoneNumbers"] = list(set(phones))
+    # Bank accounts (12–18 digits only)
+    banks = re.findall(r"\b\d{12,18}\b", text)
+    extracted["bankAccounts"] = list(set(banks))
+    # Email addresses
+    emails = re.findall(
         r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
         text
     )
+    extracted["emailAddresses"] = list(set(emails))
+    # UPI IDs (no dot-domain)
+    upis = re.findall(r"\b[a-zA-Z0-9._-]+@[a-zA-Z]+\b", text)
     clean_upi = []
+    for u in upis:
+        if u not in extracted["emailAddresses"]:
             clean_upi.append(u)
     extracted["upiIds"] = list(set(clean_upi))
+    # Phishing links
+    links = re.findall(r"https?://[^\s]+", text)
+    clean_links = [l.rstrip(".,)") for l in links]
     extracted["phishingLinks"] = list(set(clean_links))
+    # Case IDs (REF, CASE, ID)
+    case_ids = re.findall(
+        r"\b(?:REF|CASE|ID)[- ]?\d+(?:-\d+)*\b",
         text,
         flags=re.IGNORECASE
     )
+    # Employee IDs
+    emp_ids = re.findall(
+        r"\bEMP[- ]?\d+(?:-\d+)*\b",
+        text,
+        flags=re.IGNORECASE
+    )
+    extracted["caseIds"] = list(set(case_ids + emp_ids))
+    # Policy numbers
+    policies = re.findall(
+        r"\bPOL[- ]?\d+(?:-\d+)*\b",
         text,
         flags=re.IGNORECASE
     )
+    extracted["policyNumbers"] = list(set(policies))
+    # Transaction / Order IDs
+    txns = re.findall(
         r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b",
         text,
         flags=re.IGNORECASE
     )
+    extracted["orderNumbers"] = list(set(txns))
     return extracted
 # ======================================================