Ankit19102004 committed on
Commit
dd7d86b
·
1 Parent(s): 3b11c51
Files changed (1) hide show
  1. honeypot_api.py +65 -16
honeypot_api.py CHANGED
@@ -95,26 +95,75 @@ def detect_scam(text):
95
 
96
  def extract_intelligence(text):
97
 
98
- patterns = {
99
- "phoneNumbers": r"\b\+?\d{1,3}[- ]?\d{10}\b",
100
- "bankAccounts": r"\b\d{12,18}\b",
101
- "upiIds": r"[a-zA-Z0-9.\-_+]+@[a-zA-Z]+",
102
- "phishingLinks": r"https?://[^\s]+",
103
- "emailAddresses": r"[a-zA-Z0-9.\-_+]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]+",
104
- "caseIds": r"\b(?:CASE|REF|ID|TICKET)[- ]?[A-Z0-9]{4,}\b",
105
- "policyNumbers": r"\b(?:POLICY|POL|INS)[- ]?[A-Z0-9]{4,}\b",
106
- "orderNumbers": r"\b(?:ORDER|ORD)[- ]?[A-Z0-9]{4,}\b",
107
  }
108
 
109
- extracted = {k: [] for k in patterns}
110
-
111
- for key, pattern in patterns.items():
112
- matches = re.findall(pattern, text)
113
- if matches:
114
- extracted[key] = list(set(matches))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  return extracted
117
-
118
  # ======================================================
119
  # HUMAN-LIKE CONVERSATION ENGINE
120
  # ======================================================
 
95
 
96
  def extract_intelligence(text):
97
 
98
+ extracted = {
99
+ "phoneNumbers": [],
100
+ "bankAccounts": [],
101
+ "upiIds": [],
102
+ "phishingLinks": [],
103
+ "emailAddresses": [],
104
+ "caseIds": [],
105
+ "policyNumbers": [],
106
+ "orderNumbers": [],
107
  }
108
 
109
+ # Phone Numbers
110
+ phone_matches = re.findall(r"\+?\d{1,3}[- ]?\d{10}", text)
111
+ extracted["phoneNumbers"] = list(set(phone_matches))
112
+
113
+ # Bank Accounts (12–18 digits only)
114
+ bank_matches = re.findall(r"\b\d{12,18}\b", text)
115
+ extracted["bankAccounts"] = list(set(bank_matches))
116
+
117
+ # Email Addresses (strict)
118
+ email_matches = re.findall(
119
+ r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
120
+ text
121
+ )
122
+ extracted["emailAddresses"] = list(set(email_matches))
123
+
124
+ # UPI IDs (exclude emails with dot-domain)
125
+ upi_matches = re.findall(
126
+ r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)?\b",
127
+ text
128
+ )
129
+
130
+ clean_upi = []
131
+ for u in upi_matches:
132
+ if u not in extracted["emailAddresses"] and "." not in u.split("@")[1]:
133
+ clean_upi.append(u)
134
+
135
+ extracted["upiIds"] = list(set(clean_upi))
136
+
137
+ # Phishing Links (remove trailing punctuation)
138
+ link_matches = re.findall(r"https?://[^\s]+", text)
139
+ clean_links = [l.rstrip(".,)") for l in link_matches]
140
+ extracted["phishingLinks"] = list(set(clean_links))
141
+
142
+ # Case IDs (full capture including suffix)
143
+ case_matches = re.findall(
144
+ r"\b(?:CASE|REF|ID|TICKET)[- ]?\d+(?:-\d+)*\b",
145
+ text,
146
+ flags=re.IGNORECASE
147
+ )
148
+ extracted["caseIds"] = list(set(case_matches))
149
+
150
+ # Policy Numbers (full capture)
151
+ policy_matches = re.findall(
152
+ r"\b(?:POLICY|POL|INS)[- ]?\d+(?:-\d+)*\b",
153
+ text,
154
+ flags=re.IGNORECASE
155
+ )
156
+ extracted["policyNumbers"] = list(set(policy_matches))
157
+
158
+ # Order / Transaction IDs
159
+ order_matches = re.findall(
160
+ r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b",
161
+ text,
162
+ flags=re.IGNORECASE
163
+ )
164
+ extracted["orderNumbers"] = list(set(order_matches))
165
 
166
  return extracted
 
167
  # ======================================================
168
  # HUMAN-LIKE CONVERSATION ENGINE
169
  # ======================================================