Ankit19102004 committed on
Commit
3891fd2
·
1 Parent(s): dd7d86b
Files changed (1) hide show
  1. honeypot_api.py +35 -32
honeypot_api.py CHANGED
@@ -106,62 +106,65 @@ def extract_intelligence(text):
106
  "orderNumbers": [],
107
  }
108
 
109
- # Phone Numbers
110
- phone_matches = re.findall(r"\+?\d{1,3}[- ]?\d{10}", text)
111
- extracted["phoneNumbers"] = list(set(phone_matches))
112
 
113
- # Bank Accounts (12–18 digits only)
114
- bank_matches = re.findall(r"\b\d{12,18}\b", text)
115
- extracted["bankAccounts"] = list(set(bank_matches))
116
 
117
- # Email Addresses (strict)
118
- email_matches = re.findall(
119
  r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
120
  text
121
  )
122
- extracted["emailAddresses"] = list(set(email_matches))
123
-
124
- # UPI IDs (exclude emails with dot-domain)
125
- upi_matches = re.findall(
126
- r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)?\b",
127
- text
128
- )
129
 
 
 
130
  clean_upi = []
131
- for u in upi_matches:
132
- if u not in extracted["emailAddresses"] and "." not in u.split("@")[1]:
133
  clean_upi.append(u)
134
-
135
  extracted["upiIds"] = list(set(clean_upi))
136
 
137
- # Phishing Links (remove trailing punctuation)
138
- link_matches = re.findall(r"https?://[^\s]+", text)
139
- clean_links = [l.rstrip(".,)") for l in link_matches]
140
  extracted["phishingLinks"] = list(set(clean_links))
141
 
142
- # Case IDs (full capture including suffix)
143
- case_matches = re.findall(
144
- r"\b(?:CASE|REF|ID|TICKET)[- ]?\d+(?:-\d+)*\b",
145
  text,
146
  flags=re.IGNORECASE
147
  )
148
- extracted["caseIds"] = list(set(case_matches))
149
 
150
- # Policy Numbers (full capture)
151
- policy_matches = re.findall(
152
- r"\b(?:POLICY|POL|INS)[- ]?\d+(?:-\d+)*\b",
 
 
 
 
 
 
 
 
 
153
  text,
154
  flags=re.IGNORECASE
155
  )
156
- extracted["policyNumbers"] = list(set(policy_matches))
157
 
158
- # Order / Transaction IDs
159
- order_matches = re.findall(
160
  r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b",
161
  text,
162
  flags=re.IGNORECASE
163
  )
164
- extracted["orderNumbers"] = list(set(order_matches))
165
 
166
  return extracted
167
  # ======================================================
 
106
  "orderNumbers": [],
107
  }
108
 
109
+ # Strict Indian phone numbers only (+91-XXXXXXXXXX)
110
+ phones = re.findall(r"\+91[- ]?\d{10}\b", text)
111
+ extracted["phoneNumbers"] = list(set(phones))
112
 
113
+ # Bank accounts (12–18 digits only)
114
+ banks = re.findall(r"\b\d{12,18}\b", text)
115
+ extracted["bankAccounts"] = list(set(banks))
116
 
117
+ # Email addresses
118
+ emails = re.findall(
119
  r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
120
  text
121
  )
122
+ extracted["emailAddresses"] = list(set(emails))
 
 
 
 
 
 
123
 
124
+ # UPI IDs (no dot-domain)
125
+ upis = re.findall(r"\b[a-zA-Z0-9._-]+@[a-zA-Z]+\b", text)
126
  clean_upi = []
127
+ for u in upis:
128
+ if u not in extracted["emailAddresses"]:
129
  clean_upi.append(u)
 
130
  extracted["upiIds"] = list(set(clean_upi))
131
 
132
+ # Phishing links
133
+ links = re.findall(r"https?://[^\s]+", text)
134
+ clean_links = [l.rstrip(".,)") for l in links]
135
  extracted["phishingLinks"] = list(set(clean_links))
136
 
137
+ # Case IDs (REF, CASE, ID)
138
+ case_ids = re.findall(
139
+ r"\b(?:REF|CASE|ID)[- ]?\d+(?:-\d+)*\b",
140
  text,
141
  flags=re.IGNORECASE
142
  )
 
143
 
144
+ # Employee IDs
145
+ emp_ids = re.findall(
146
+ r"\bEMP[- ]?\d+(?:-\d+)*\b",
147
+ text,
148
+ flags=re.IGNORECASE
149
+ )
150
+
151
+ extracted["caseIds"] = list(set(case_ids + emp_ids))
152
+
153
+ # Policy numbers
154
+ policies = re.findall(
155
+ r"\bPOL[- ]?\d+(?:-\d+)*\b",
156
  text,
157
  flags=re.IGNORECASE
158
  )
159
+ extracted["policyNumbers"] = list(set(policies))
160
 
161
+ # Transaction / Order IDs
162
+ txns = re.findall(
163
  r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b",
164
  text,
165
  flags=re.IGNORECASE
166
  )
167
+ extracted["orderNumbers"] = list(set(txns))
168
 
169
  return extracted
170
  # ======================================================