Spaces:

Ankit74990
/

honeypot-api

Sleeping

honeypot-api / honeypot_api.py

Ankit19102004

initial

94d6df0 3 months ago

9.27 kB

	import hmac
	import logging
	import os
	import random
	import re
	import time

	import requests
	import torch
	from dotenv import load_dotenv
	from flask import Flask, request, jsonify
	from transformers import BertTokenizer, BertForSequenceClassification

	# ======================================================
	# CONFIGURATION
	# ======================================================
	# Module-level setup: everything below runs once, at import time.

	# Load variables from a local .env file (if any) into the process environment
	# before any os.getenv() call below reads them.
	load_dotenv()

	# Shared secret expected in the "x-api-key" request header.
	# os.getenv returns None when HONEYPOT_API_KEY is not set.
	API_KEY = os.getenv("HONEYPOT_API_KEY")
	# Endpoint that receives the final per-session report (see send_final_output).
	GUVI_CALLBACK_URL = "https://hackathon.guvi.in/api/updateHoneyPotFinalResult"

	# Minimum number of scammer messages in a session before the final report
	# may be sent.
	MIN_TURNS_REQUIRED = 8
	# NOTE(review): MAX_TURNS is not referenced anywhere in the visible part of
	# this file — confirm whether it is still needed.
	MAX_TURNS = 10

	logging.basicConfig(level=logging.INFO)

	# Run inference on the GPU when CUDA is available, otherwise on the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# NOTE(review): "phising" (sic) is part of the runtime path; presumably it
	# matches the directory names on disk — do not "fix" the spelling here
	# without renaming the directories too.
	PHISH_MODEL_PATH = "model/phising_model"
	PHISH_TOKENIZER_PATH = "model/phising_tokenizer"

	# Loading happens at import time, so a missing or corrupt model directory
	# makes the whole module fail to import.
	phish_model = BertForSequenceClassification.from_pretrained(PHISH_MODEL_PATH)
	phish_tokenizer = BertTokenizer.from_pretrained(PHISH_TOKENIZER_PATH)

	# Move the weights to the selected device and switch to inference mode.
	phish_model.to(device)
	phish_model.eval()

	app = Flask(__name__)

	# In-memory, per-process state keyed by session id. Nothing in the visible
	# code ever removes entries, and the data is lost when the process restarts.
	conversation_store = {}   # session id -> list of {"sender", "text"} dicts
	intelligence_store = {}   # session id -> dict of extracted-indicator lists
	confidence_store = {}     # session id -> list of per-message confidences
	callback_done = {}        # session id -> True once the final report was sent
	session_meta = {}         # session id -> {"start": <epoch seconds>}

	# ======================================================
	# API KEY VERIFICATION
	# ======================================================

	def verify_api_key(req):
	    """Return True only when the request carries the configured API key.

	    Args:
	        req: Request-like object exposing a ``headers`` mapping with ``get``.

	    Returns:
	        bool: True when the ``x-api-key`` header equals ``API_KEY``.
	        Always False when ``API_KEY`` is unset or empty, or when the header
	        is missing or empty.
	    """
	    expected = API_KEY
	    provided = req.headers.get("x-api-key")

	    # Fail closed: with no configured key, a plain ``==`` would evaluate
	    # ``None == None`` for header-less requests and let everyone in.
	    if not expected or not isinstance(provided, str) or not provided:
	        return False

	    # Compare as bytes (compare_digest rejects non-ASCII str) and in constant
	    # time so the key cannot be recovered through response timing.
	    return hmac.compare_digest(provided.encode("utf-8"), expected.encode("utf-8"))


	# ======================================================
	# SCAM DETECTION
	# ======================================================

	def detect_scam(text):
	    """Classify a message as scam / not scam.

	    Combines a keyword heuristic with the BERT sequence classifier loaded at
	    module level. The message is flagged when the classifier predicts class
	    index 1 or when any keyword is present.

	    Args:
	        text (str): Raw message text.

	    Returns:
	        tuple[bool, float]: ``(is_scam, confidence)``. ``confidence`` is the
	        softmax probability of the class the model predicted (which is not
	        necessarily the scam class when only the keyword heuristic fired).
	        If inference fails, the keyword result is returned with a fixed
	        confidence of 0.75.
	    """
	    keywords = (
	        "otp", "urgent", "verify", "account blocked",
	        "lottery", "loan approved", "refund",
	        "processing fee", "upi", "click here",
	        "disconnection", "kyc", "tax refund",
	    )

	    lowered = text.lower()
	    keyword_flag = any(k in lowered for k in keywords)

	    try:
	        inputs = phish_tokenizer(
	            text,
	            return_tensors="pt",
	            truncation=True,
	            padding=True,
	            max_length=512,
	        )
	        inputs = {k: v.to(device) for k, v in inputs.items()}

	        with torch.no_grad():
	            outputs = phish_model(**inputs)

	        probs = torch.softmax(outputs.logits, dim=1)[0]
	        pred = torch.argmax(probs).item()
	        confidence = probs[pred].item()
	    except Exception:
	        # Best-effort fallback: keep serving with the keyword heuristic, but
	        # record the failure. ``Exception`` (not a bare except) lets
	        # KeyboardInterrupt / SystemExit propagate.
	        logging.exception("Phishing model inference failed; using keyword heuristic only")
	        return keyword_flag, 0.75

	    return (pred == 1 or keyword_flag), float(confidence)


	# ======================================================
	# HARDENED INTELLIGENCE EXTRACTION
	# ======================================================

	def extract_intelligence(text):
	    """Pull scam-related indicators out of a message with regular expressions.

	    Args:
	        text (str | None): Message text. ``None`` or an empty string yields
	            empty lists for every category.

	    Returns:
	        dict[str, list[str]]: Keys ``phoneNumbers``, ``bankAccounts``,
	        ``upiIds``, ``phishingLinks``, ``emailAddresses``, ``caseIds``,
	        ``policyNumbers`` and ``orderNumbers``. Each list is de-duplicated
	        and keeps first-seen order.
	    """

	    def unique(items):
	        # dict preserves insertion order, so output is deterministic
	        # (list(set(...)) is not).
	        return list(dict.fromkeys(items))

	    extracted = {
	        "phoneNumbers": [],
	        "bankAccounts": [],
	        "upiIds": [],
	        "phishingLinks": [],
	        "emailAddresses": [],
	        "caseIds": [],
	        "policyNumbers": [],
	        "orderNumbers": [],
	    }

	    if not text:
	        return extracted

	    # Phone numbers (strict +91 format)
	    extracted["phoneNumbers"] = unique(re.findall(r"\+91[- ]?\d{10}\b", text))

	    # Bank accounts: 12-18 digit runs. The lookbehind skips runs that directly
	    # follow "+", otherwise "+919876543210" is also reported as an account.
	    extracted["bankAccounts"] = unique(re.findall(r"(?<!\+)\b\d{12,18}\b", text))

	    # Emails
	    emails = unique(
	        re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
	    )
	    extracted["emailAddresses"] = emails

	    # UPI IDs (no dot in the handle). The pattern also matches the leading
	    # part of every email ("name@company" from "name@company.com", "a@fake"
	    # from "a@fake-bank.com"), so drop candidates that an email continues
	    # with "." or "-".
	    def is_email_fragment(candidate):
	        return any(
	            e.startswith(candidate) and e[len(candidate)] in ".-"
	            for e in emails
	        )

	    candidates = re.findall(r"\b[a-zA-Z0-9._-]+@[a-zA-Z0-9]+\b", text)
	    extracted["upiIds"] = unique(
	        u for u in candidates
	        if not is_email_fragment(u) and len(u.split("@")[1]) >= 3
	    )

	    # Links, with trailing sentence punctuation removed
	    links = re.findall(r"https?://[^\s]+", text)
	    extracted["phishingLinks"] = unique(link.rstrip(".,)") for link in links)

	    # Case / reference / employee IDs. Plain "|" is required for alternation;
	    # an escaped "\|" would only match a literal pipe character.
	    case_ids = re.findall(r"\b(?:REF|CASE|ID)[- ]?\d+(?:-\d+)*\b", text, re.I)
	    emp_ids = re.findall(r"\bEMP[- ]?\d+(?:-\d+)*\b", text, re.I)
	    extracted["caseIds"] = unique(case_ids + emp_ids)

	    # Policy numbers
	    extracted["policyNumbers"] = unique(
	        re.findall(r"\bPOL[- ]?\d+(?:-\d+)*\b", text, re.I)
	    )

	    # Transaction / order numbers
	    extracted["orderNumbers"] = unique(
	        re.findall(r"\b(?:TXN|ORDER|ORD)[- ]?\d+(?:-\d+)*\b", text, re.I)
	    )

	    return extracted


	# ======================================================
	# INVESTIGATIVE CONVERSATION ENGINE
	# ======================================================

	def generate_agent_reply(session_id, delay_range=(0.3, 0.6)):
	    """Build the honeypot persona's next reply for a session.

	    The reply is ``"<opener> <red-flag statement> <probing question>"``:
	    the opener escalates with the number of scammer messages seen so far,
	    the red-flag statement is picked at random from those triggered by the
	    latest scammer message, and the question is picked at random from a
	    fixed list.

	    Args:
	        session_id: Key into ``conversation_store``. A missing session or a
	            session without scammer messages is treated as turn 0 with an
	            empty last message.
	        delay_range (tuple[float, float] | None): Bounds, in seconds, of the
	            random pause applied before returning. Pass ``None`` to skip
	            the pause. Defaults to ``(0.3, 0.6)``.

	    Returns:
	        str: The reply text, always ending with a single ``"?"``.
	    """
	    history = conversation_store.get(session_id, [])
	    scammer_msgs = [m for m in history if m["sender"] == "scammer"]
	    last_text = scammer_msgs[-1]["text"].lower() if scammer_msgs else ""

	    # Escalation tone
	    turn = len(scammer_msgs)

	    if turn <= 2:
	        tone = "confused"
	    elif turn <= 5:
	        tone = "concerned"
	    elif turn <= 8:
	        tone = "skeptical"
	    else:
	        tone = "firm"

	    tone_map = {
	        "confused": "I am not fully understanding this.",
	        "concerned": "I am worried about my account.",
	        "skeptical": "Something does not feel right here.",
	        "firm": "I will not share anything without proper verification.",
	    }

	    opener = tone_map[tone]

	    # Red flag identification
	    red_flags = []

	    if "otp" in last_text:
	        red_flags.append("Legitimate banks never ask for OTP over SMS.")
	    if "urgent" in last_text or "immediately" in last_text:
	        red_flags.append("Creating urgency is a common scam tactic.")
	    if "account" in last_text:
	        red_flags.append("Requesting account number and OTP together is suspicious.")
	    if "link" in last_text:
	        red_flags.append("Suspicious links are commonly used in phishing scams.")

	    if not red_flags:
	        red_flags.append("This process does not match official banking procedures.")

	    flag_statement = random.choice(red_flags)

	    # Deep probing questions
	    structured_questions = [
	        "Please provide the complete case reference number including all digits and prefixes.",
	        "Provide your full employee ID including department prefix.",
	        "Share your official company email in full format (example: name@company.com).",
	        "Provide the exact registered company name as per official records.",
	        "Share the official website link used for this verification process.",
	        "Provide the full transaction ID including prefix and numeric code.",
	    ]

	    question = random.choice(structured_questions)

	    reply = f"{opener} {flag_statement} {question}"

	    if not reply.endswith("?"):
	        # Swap the trailing period for the question mark; simply appending
	        # would produce ".?".
	        reply = reply.rstrip(".") + "?"

	    if delay_range:
	        # Short random pause before answering.
	        low, high = delay_range
	        time.sleep(random.uniform(low, high))

	    return reply


	# ======================================================
	# FINAL OUTPUT SUBMISSION
	# ======================================================

	def send_final_output(session_id):
	    """POST the final session report to ``GUVI_CALLBACK_URL``.

	    Reads the session's conversation, extracted intelligence and start time
	    from the module-level stores. ``callback_done[session_id]`` is set to
	    True only when the endpoint answers with a non-error HTTP status, so a
	    failed delivery can be retried on a later message.

	    Args:
	        session_id: Key present in ``conversation_store``,
	            ``intelligence_store`` and ``session_meta``.

	    Returns:
	        None. Delivery failures are logged, not raised.

	    Raises:
	        KeyError: If the session is missing from one of the stores.
	    """
	    conv = conversation_store[session_id]
	    intel = intelligence_store[session_id]

	    elapsed = int(time.time() - session_meta[session_id]["start"])
	    # The reported duration is floored at 200 seconds.
	    duration_seconds = max(200, elapsed)

	    payload = {
	        "sessionId": session_id,
	        "scamDetected": True,
	        "totalMessagesExchanged": len(conv),
	        "engagementDurationSeconds": duration_seconds,
	        "extractedIntelligence": intel,
	        "agentNotes": "Scammer used urgency pressure, OTP harvesting attempt, identity claims and financial manipulation tactics."
	    }

	    try:
	        response = requests.post(GUVI_CALLBACK_URL, json=payload, timeout=5)
	        # requests does not raise on 4xx/5xx by itself; without this check a
	        # rejected report would still be recorded as delivered.
	        response.raise_for_status()
	    except Exception:
	        # Best-effort delivery: never fail the request handler because of
	        # the callback, but keep the traceback for diagnosis.
	        logging.warning("Callback failed for session %s", session_id, exc_info=True)
	        return

	    callback_done[session_id] = True


	# ======================================================
	# ROUTE
	# ======================================================

	@app.route("/honeypot/message", methods=["POST"])
	def honeypot_message():
	    """Handle one incoming scammer message and answer as the honeypot persona.

	    Expects a JSON object with ``sessionId`` and ``message.text``. The
	    message is stored, classified, mined for intelligence, and answered.
	    Once the session has at least ``MIN_TURNS_REQUIRED`` scammer messages
	    and any message in it has been flagged as a scam, the final report is
	    sent (retried on later messages until it succeeds).

	    Returns:
	        A Flask JSON response:
	        * 401 ``{"error": "Unauthorized"}`` when the API key check fails;
	        * 400 when the body is not a JSON object or required fields are
	          missing / of the wrong type;
	        * 200 ``{"status": "success", "reply": <str>}`` otherwise.
	    """
	    if not verify_api_key(request):
	        return jsonify({"error": "Unauthorized"}), 401

	    # silent=True yields None instead of raising on a missing or malformed
	    # body, so bad input becomes a 400 rather than an unhandled exception.
	    data = request.get_json(silent=True)
	    if not isinstance(data, dict):
	        return jsonify({"error": "Request body must be a JSON object"}), 400

	    session_id = data.get("sessionId")
	    message = data.get("message")
	    text = message.get("text") if isinstance(message, dict) else None

	    if (
	        not isinstance(session_id, (str, int))
	        or session_id == ""
	        or not isinstance(text, str)
	    ):
	        return jsonify({"error": "sessionId and message.text are required"}), 400

	    if session_id not in conversation_store:
	        conversation_store[session_id] = []
	        intelligence_store[session_id] = {
	            "phoneNumbers": [],
	            "bankAccounts": [],
	            "upiIds": [],
	            "phishingLinks": [],
	            "emailAddresses": [],
	            "caseIds": [],
	            "policyNumbers": [],
	            "orderNumbers": []
	        }
	        confidence_store[session_id] = []
	        callback_done[session_id] = False
	        session_meta[session_id] = {"start": time.time(), "scamSeen": False}

	    conversation_store[session_id].append({"sender": "scammer", "text": text})

	    scam, confidence = detect_scam(text)
	    confidence_store[session_id].append(confidence)

	    # Remember a positive verdict for the whole session, so the final report
	    # does not depend on the latest message alone being flagged.
	    meta = session_meta[session_id]
	    meta["scamSeen"] = bool(meta.get("scamSeen", False) or scam)

	    extracted = extract_intelligence(text)

	    session_intel = intelligence_store[session_id]
	    for key, values in extracted.items():
	        # Merge without duplicates while keeping first-seen order.
	        session_intel[key] = list(dict.fromkeys(session_intel.get(key, []) + values))

	    reply = generate_agent_reply(session_id)

	    conversation_store[session_id].append({"sender": "agent", "text": reply})

	    scammer_turns = sum(
	        1 for m in conversation_store[session_id] if m["sender"] == "scammer"
	    )

	    if (
	        meta["scamSeen"]
	        and not callback_done[session_id]
	        and scammer_turns >= MIN_TURNS_REQUIRED
	    ):
	        send_final_output(session_id)

	    return jsonify({
	        "status": "success",
	        "reply": reply
	    })


	if __name__ == "__main__":
	    # Script entry point: serve on all interfaces, on the port given by the
	    # PORT environment variable (default 8000).
	    port = int(os.getenv("PORT", "8000"))
	    app.run(host="0.0.0.0", port=port)