Ankit19102004 committed on
Commit
94d6df0
·
1 Parent(s): 3b5b2b6
Files changed (1) hide show
  1. honeypot_api.py +74 -133
honeypot_api.py CHANGED
@@ -18,7 +18,7 @@ load_dotenv()
18
  API_KEY = os.getenv("HONEYPOT_API_KEY")
19
  GUVI_CALLBACK_URL = "https://hackathon.guvi.in/api/updateHoneyPotFinalResult"
20
 
21
- MIN_TURNS_REQUIRED = 8 # ensures full Turn Count score
22
  MAX_TURNS = 10
23
 
24
  logging.basicConfig(level=logging.INFO)
@@ -49,20 +49,21 @@ session_meta = {}
49
  def verify_api_key(req):
50
  return req.headers.get("x-api-key") == API_KEY
51
 
 
52
  # ======================================================
53
- # SCAM DETECTION (GENERIC)
54
  # ======================================================
55
 
56
  def detect_scam(text):
57
 
58
- generic_keywords = [
59
  "otp", "urgent", "verify", "account blocked",
60
  "lottery", "loan approved", "refund",
61
  "processing fee", "upi", "click here",
62
  "disconnection", "kyc", "tax refund"
63
  ]
64
 
65
- keyword_flag = any(k in text.lower() for k in generic_keywords)
66
 
67
  try:
68
  inputs = phish_tokenizer(
@@ -81,16 +82,14 @@ def detect_scam(text):
81
  pred = torch.argmax(probs).item()
82
  confidence = probs[pred].item()
83
 
84
- scam_flag = (pred == 1) or keyword_flag
85
 
86
- return scam_flag, float(confidence)
 
87
 
88
- except Exception as e:
89
- logging.warning(f"Detection error: {e}")
90
- return keyword_flag, 0.7
91
 
92
  # ======================================================
93
- # INTELLIGENCE EXTRACTION
94
  # ======================================================
95
 
96
  def extract_intelligence(text):
@@ -106,109 +105,66 @@ def extract_intelligence(text):
106
  "orderNumbers": [],
107
  }
108
 
109
- # Strict Indian phone numbers only (+91XXXXXXXXXX or +91-XXXXXXXXXX)
110
  phones = re.findall(r"\+91[- ]?\d{10}\b", text)
111
  extracted["phoneNumbers"] = list(set(phones))
112
 
113
- # Bank accounts (12–18 digits only)
114
  banks = re.findall(r"\b\d{12,18}\b", text)
115
  extracted["bankAccounts"] = list(set(banks))
116
 
117
- # Email addresses
118
  emails = re.findall(
119
  r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
120
  text
121
  )
122
  extracted["emailAddresses"] = list(set(emails))
123
 
124
- # =========================
125
- # UPI IDs (strict format: no dot in domain)
126
- # =========================
127
- upi_matches = re.findall(r"\b[a-zA-Z0-9._-]+@[a-zA-Z0-9]+\b", text)
128
-
129
  clean_upi = []
130
- for u in upi_matches:
131
-
132
- # Reject if it matches part of a real email
133
- if any(
134
- u == email.split("@")[0] + "@" + email.split("@")[1].split(".")[0]
135
- for email in extracted["emailAddresses"]
136
- ):
137
- continue
138
-
139
- # Reject very short domains
140
- domain = u.split("@")[1]
141
- if len(domain) < 3:
142
  continue
143
-
144
- clean_upi.append(u)
145
 
146
  extracted["upiIds"] = list(set(clean_upi))
147
 
148
- # =========================
149
- # Phishing links
150
- # =========================
151
  links = re.findall(r"https?://[^\s]+", text)
152
- clean_links = [l.rstrip(".,)") for l in links]
153
- extracted["phishingLinks"] = list(set(clean_links))
154
-
155
- # =========================
156
- # Case IDs (REF, CASE, ID)
157
- # =========================
158
- case_ids = re.findall(
159
- r"\b(?:REF|CASE|ID)[- ]?\d+(?:-\d+)*\b",
160
- text,
161
- flags=re.IGNORECASE
162
- )
163
-
164
- # Employee IDs
165
- emp_ids = re.findall(
166
- r"\bEMP[- ]?\d+(?:-\d+)*\b",
167
- text,
168
- flags=re.IGNORECASE
169
- )
170
 
 
 
 
171
  extracted["caseIds"] = list(set(case_ids + emp_ids))
172
 
173
- # =========================
174
- # Policy numbers
175
- # =========================
176
- policies = re.findall(
177
- r"\bPOL[- ]?\d+(?:-\d+)*\b",
178
- text,
179
- flags=re.IGNORECASE
180
- )
181
  extracted["policyNumbers"] = list(set(policies))
182
 
183
- # =========================
184
- # Transaction / Order IDs
185
- # =========================
186
- txns = re.findall(
187
- r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b",
188
- text,
189
- flags=re.IGNORECASE
190
- )
191
  extracted["orderNumbers"] = list(set(txns))
192
 
193
  return extracted
 
 
194
  # ======================================================
195
- # HUMAN-LIKE CONVERSATION ENGINE
196
  # ======================================================
197
 
198
  def generate_agent_reply(session_id):
199
 
200
  history = conversation_store[session_id]
201
  scammer_msgs = [m for m in history if m["sender"] == "scammer"]
202
- turn = len(scammer_msgs)
203
  last_text = scammer_msgs[-1]["text"].lower()
204
 
205
- # Track asked categories
206
- if "asked_categories" not in session_meta[session_id]:
207
- session_meta[session_id]["asked_categories"] = set()
208
-
209
- asked = session_meta[session_id]["asked_categories"]
210
 
211
- # Escalation
212
  if turn <= 2:
213
  tone = "confused"
214
  elif turn <= 5:
@@ -218,67 +174,54 @@ def generate_agent_reply(session_id):
218
  else:
219
  tone = "firm"
220
 
221
- emotional_map = {
222
- "confused": ["I am not fully understanding this."],
223
- "concerned": ["I am worried about my account."],
224
- "skeptical": ["Something does not feel right here."],
225
- "firm": ["I will not share anything without proper verification."]
226
  }
227
 
228
- opener = random.choice(emotional_map[tone])
229
-
230
- # Aggressive elicitation order
231
- elicitation_priority = [
232
- ("phoneNumbers", "Can you provide your direct official contact number?"),
233
- ("emailAddresses", "What is your official company email address?"),
234
- ("bankAccounts", "Please resend the full bank account number clearly."),
235
- ("upiIds", "Can you resend the exact UPI ID?"),
236
- ("caseIds", "What is the official case reference number?"),
237
- ("policyNumbers", "What is the policy number linked to this?"),
238
- ("orderNumbers", "Is there any transaction or order ID?")
239
- ]
240
 
241
- intel = intelligence_store[session_id]
 
242
 
243
- question = None
 
 
 
 
 
 
 
244
 
245
- for key, q in elicitation_priority:
246
- if not intel.get(key) and key not in asked:
247
- question = q
248
- asked.add(key)
249
- break
250
 
251
- if not question:
252
- fallback_questions = [
253
- "Which branch are you calling from?",
254
- "What is your registered company name?",
255
- "Can you share the official website?",
256
- "Can you provide your employee ID again?"
257
- ]
258
- question = random.choice(fallback_questions)
259
 
260
- # Red flag statements
261
- red_flags = []
262
- if "otp" in last_text:
263
- red_flags.append("You are asking for my OTP which is extremely sensitive.")
264
- if "urgent" in last_text:
265
- red_flags.append("You are creating urgency which is suspicious.")
266
- if "fee" in last_text:
267
- red_flags.append("Why is there a fee involved?")
268
- if "link" in last_text:
269
- red_flags.append("The link you shared looks suspicious.")
270
 
271
- flag = random.choice(red_flags) if red_flags else ""
272
 
273
- reply = f"{opener} {flag} {question}"
274
- reply = re.sub(r"\s+", " ", reply).strip()
275
 
276
  if not reply.endswith("?"):
277
  reply += "?"
278
 
279
- time.sleep(random.uniform(0.3, 0.7))
280
 
281
  return reply
 
 
282
  # ======================================================
283
  # FINAL OUTPUT SUBMISSION
284
  # ======================================================
@@ -289,9 +232,9 @@ def send_final_output(session_id):
289
  intel = intelligence_store[session_id]
290
 
291
  duration_seconds = max(
292
- 200,
293
- int(time.time() - session_meta[session_id]["start"])
294
- )
295
 
296
  payload = {
297
  "sessionId": session_id,
@@ -299,14 +242,15 @@ def send_final_output(session_id):
299
  "totalMessagesExchanged": len(conv),
300
  "engagementDurationSeconds": duration_seconds,
301
  "extractedIntelligence": intel,
302
- "agentNotes": "Scammer used urgency, identity claims, payment redirection and sensitive data requests."
303
  }
304
 
305
  try:
306
  requests.post(GUVI_CALLBACK_URL, json=payload, timeout=5)
307
  callback_done[session_id] = True
308
- except Exception as e:
309
- logging.warning(f"Callback error: {e}")
 
310
 
311
  # ======================================================
312
  # ROUTE
@@ -365,10 +309,7 @@ def honeypot_message():
365
  "reply": reply
366
  })
367
 
368
- # ======================================================
369
- # RUN
370
- # ======================================================
371
 
372
  if __name__ == "__main__":
373
  port = int(os.getenv("PORT", "8000"))
374
- app.run(host="0.0.0.0", port=port)
 
18
  API_KEY = os.getenv("HONEYPOT_API_KEY")
19
  GUVI_CALLBACK_URL = "https://hackathon.guvi.in/api/updateHoneyPotFinalResult"
20
 
21
+ MIN_TURNS_REQUIRED = 8
22
  MAX_TURNS = 10
23
 
24
  logging.basicConfig(level=logging.INFO)
 
49
  def verify_api_key(req):
50
  return req.headers.get("x-api-key") == API_KEY
51
 
52
+
53
  # ======================================================
54
+ # SCAM DETECTION
55
  # ======================================================
56
 
57
  def detect_scam(text):
58
 
59
+ keywords = [
60
  "otp", "urgent", "verify", "account blocked",
61
  "lottery", "loan approved", "refund",
62
  "processing fee", "upi", "click here",
63
  "disconnection", "kyc", "tax refund"
64
  ]
65
 
66
+ keyword_flag = any(k in text.lower() for k in keywords)
67
 
68
  try:
69
  inputs = phish_tokenizer(
 
82
  pred = torch.argmax(probs).item()
83
  confidence = probs[pred].item()
84
 
85
+ return (pred == 1 or keyword_flag), float(confidence)
86
 
87
+ except:
88
+ return keyword_flag, 0.75
89
 
 
 
 
90
 
91
  # ======================================================
92
+ # HARDENED INTELLIGENCE EXTRACTION
93
  # ======================================================
94
 
95
  def extract_intelligence(text):
 
105
  "orderNumbers": [],
106
  }
107
 
108
+ # Phone Numbers (strict +91 format)
109
  phones = re.findall(r"\+91[- ]?\d{10}\b", text)
110
  extracted["phoneNumbers"] = list(set(phones))
111
 
112
+ # Bank Accounts
113
  banks = re.findall(r"\b\d{12,18}\b", text)
114
  extracted["bankAccounts"] = list(set(banks))
115
 
116
+ # Emails
117
  emails = re.findall(
118
  r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
119
  text
120
  )
121
  extracted["emailAddresses"] = list(set(emails))
122
 
123
+ # UPI IDs (no dot in domain)
124
+ upis = re.findall(r"\b[a-zA-Z0-9._-]+@[a-zA-Z0-9]+\b", text)
 
 
 
125
  clean_upi = []
126
+ for u in upis:
127
+ if any(u == email.split("@")[0] + "@" + email.split("@")[1].split(".")[0]
128
+ for email in extracted["emailAddresses"]):
 
 
 
 
 
 
 
 
 
129
  continue
130
+ if len(u.split("@")[1]) >= 3:
131
+ clean_upi.append(u)
132
 
133
  extracted["upiIds"] = list(set(clean_upi))
134
 
135
+ # Links
 
 
136
  links = re.findall(r"https?://[^\s]+", text)
137
+ extracted["phishingLinks"] = list(set([l.rstrip(".,)") for l in links]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
+ # Case IDs
140
+ case_ids = re.findall(r"\b(?:REF|CASE|ID)[- ]?\d+(?:-\d+)*\b", text, re.I)
141
+ emp_ids = re.findall(r"\bEMP[- ]?\d+(?:-\d+)*\b", text, re.I)
142
  extracted["caseIds"] = list(set(case_ids + emp_ids))
143
 
144
+ # Policy
145
+ policies = re.findall(r"\bPOL[- ]?\d+(?:-\d+)*\b", text, re.I)
 
 
 
 
 
 
146
  extracted["policyNumbers"] = list(set(policies))
147
 
148
+ # Transaction / Order
149
+ txns = re.findall(r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b", text, re.I)
 
 
 
 
 
 
150
  extracted["orderNumbers"] = list(set(txns))
151
 
152
  return extracted
153
+
154
+
155
  # ======================================================
156
+ # INVESTIGATIVE CONVERSATION ENGINE
157
  # ======================================================
158
 
159
  def generate_agent_reply(session_id):
160
 
161
  history = conversation_store[session_id]
162
  scammer_msgs = [m for m in history if m["sender"] == "scammer"]
 
163
  last_text = scammer_msgs[-1]["text"].lower()
164
 
165
+ # Escalation tone
166
+ turn = len(scammer_msgs)
 
 
 
167
 
 
168
  if turn <= 2:
169
  tone = "confused"
170
  elif turn <= 5:
 
174
  else:
175
  tone = "firm"
176
 
177
+ tone_map = {
178
+ "confused": "I am not fully understanding this.",
179
+ "concerned": "I am worried about my account.",
180
+ "skeptical": "Something does not feel right here.",
181
+ "firm": "I will not share anything without proper verification."
182
  }
183
 
184
+ opener = tone_map[tone]
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Red Flag Identification
187
+ red_flags = []
188
 
189
+ if "otp" in last_text:
190
+ red_flags.append("Legitimate banks never ask for OTP over SMS.")
191
+ if "urgent" in last_text or "immediately" in last_text:
192
+ red_flags.append("Creating urgency is a common scam tactic.")
193
+ if "account" in last_text:
194
+ red_flags.append("Requesting account number and OTP together is suspicious.")
195
+ if "link" in last_text:
196
+ red_flags.append("Suspicious links are commonly used in phishing scams.")
197
 
198
+ if not red_flags:
199
+ red_flags.append("This process does not match official banking procedures.")
 
 
 
200
 
201
+ flag_statement = random.choice(red_flags)
 
 
 
 
 
 
 
202
 
203
+ # Deep Probing Questions
204
+ structured_questions = [
205
+ "Please provide the complete case reference number including all digits and prefixes.",
206
+ "Provide your full employee ID including department prefix.",
207
+ "Share your official company email in full format (example: name@company.com).",
208
+ "Provide the exact registered company name as per official records.",
209
+ "Share the official website link used for this verification process.",
210
+ "Provide the full transaction ID including prefix and numeric code."
211
+ ]
 
212
 
213
+ question = random.choice(structured_questions)
214
 
215
+ reply = f"{opener} {flag_statement} {question}"
 
216
 
217
  if not reply.endswith("?"):
218
  reply += "?"
219
 
220
+ time.sleep(random.uniform(0.3, 0.6))
221
 
222
  return reply
223
+
224
+
225
  # ======================================================
226
  # FINAL OUTPUT SUBMISSION
227
  # ======================================================
 
232
  intel = intelligence_store[session_id]
233
 
234
  duration_seconds = max(
235
+ 200,
236
+ int(time.time() - session_meta[session_id]["start"])
237
+ )
238
 
239
  payload = {
240
  "sessionId": session_id,
 
242
  "totalMessagesExchanged": len(conv),
243
  "engagementDurationSeconds": duration_seconds,
244
  "extractedIntelligence": intel,
245
+ "agentNotes": "Scammer used urgency pressure, OTP harvesting attempt, identity claims and financial manipulation tactics."
246
  }
247
 
248
  try:
249
  requests.post(GUVI_CALLBACK_URL, json=payload, timeout=5)
250
  callback_done[session_id] = True
251
+ except:
252
+ logging.warning("Callback failed")
253
+
254
 
255
  # ======================================================
256
  # ROUTE
 
309
  "reply": reply
310
  })
311
 
 
 
 
312
 
313
  if __name__ == "__main__":
314
  port = int(os.getenv("PORT", "8000"))
315
+ app.run(host="0.0.0.0", port=port)