Spaces:

Nick-2x
/

URL_PART

Sleeping

App Files Files Community

URL_PART / app.py

Nick-2x

Create app.py

103e422 verified 16 days ago

raw

history blame contribute delete

1.98 kB

	from fastapi import FastAPI
	from pydantic import BaseModel
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch

	# ASGI application object; the route decorators further down register
	# their handlers on it.
	app = FastAPI()

	# Hugging Face Hub id of the sequence-classification checkpoint to load.
	# Original author's note: multimodal phishing detector (URLs, SMS, email).
	MODEL_ID = "ealvaradob/bert-finetuned-phishing"

	# The tokenizer and model are loaded once, at import time, so every request
	# reuses the same objects. The message is printed before loading starts.
	print("Loading model... This might take a minute as it's a 'large' BERT model.")
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
	model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

	# Request body schema for POST /predict: a JSON object with a single
	# string field holding the URL to classify.
	class URLInput(BaseModel):
	    # Raw URL text exactly as submitted by the client.
	    url: str

	@app.get("/")
	async def root():
	    # Liveness endpoint: returns a fixed status payload so a client can
	    # confirm the service is up.
	    status_payload = {"status": "URL Phishing Detector API is running"}
	    return status_payload

	def _is_phishing_label(label_name: str) -> bool:
	    """Return True when a model label name denotes the phishing class.

	    Accepts generic names whose trailing class id is exactly 1 (for example
	    "LABEL_1" or "1") and descriptive names containing "phishing", while
	    rejecting negated names such as "not_phishing" or "non-phishing".
	    """
	    normalized = label_name.strip().lower()

	    # Compare the trailing numeric id exactly. A plain `"1" in label_name`
	    # substring test would also match names like "LABEL_10".
	    stem = normalized.rstrip("0123456789")
	    trailing_id = normalized[len(stem):]
	    if trailing_id and int(trailing_id) == 1:
	        return True

	    # Negated names contain the keyword but describe the opposite class.
	    if normalized.startswith(("not", "non")):
	        return False
	    return "phishing" in normalized


	@app.post("/predict")
	async def predict_url(data: URLInput):
	    """Classify the submitted URL as phishing or legitimate.

	    Args:
	        data: Request body carrying the URL string in ``data.url``.

	    Returns:
	        On success, a dict with the keys ``url`` (the URL as submitted),
	        ``prediction`` ("phishing" or "legitimate"), ``confidence`` (the
	        top softmax probability rounded to 4 decimals), ``raw_scores``
	        (label name -> probability) and ``is_malicious`` (bool).
	        For a missing or too-short URL, ``{"error": "Invalid URL provided"}``
	        is returned instead; the response shape is kept for existing clients.
	    """
	    # NOTE(review): tokenization and inference below are synchronous work
	    # inside an async handler, so they occupy the event loop while running.

	    # 1. Basic pre-check. Surrounding whitespace is stripped first so that
	    # a whitespace-only value cannot pass the minimum-length test.
	    url = data.url.strip() if data.url else ""
	    if len(url) < 4:
	        return {"error": "Invalid URL provided"}

	    # 2. Tokenize and predict; inputs longer than 512 tokens are truncated.
	    inputs = tokenizer(url, return_tensors="pt", truncation=True, max_length=512)

	    with torch.no_grad():
	        logits = model(**inputs).logits
	        # Softmax over the class dimension turns logits into probabilities.
	        predictions = torch.nn.functional.softmax(logits, dim=-1)

	    # Single input, so take the first (only) row of the batch.
	    probs = predictions[0].tolist()

	    # 3. Map class indices to the label names stored in the model config.
	    id2label = model.config.id2label
	    confidences = {id2label[i]: prob for i, prob in enumerate(probs)}

	    # Pick the winner by index so the result does not depend on label names
	    # being unique keys in `confidences`.
	    best_idx = max(range(len(probs)), key=probs.__getitem__)
	    is_phishing = _is_phishing_label(id2label[best_idx])

	    return {
	        "url": data.url,
	        "prediction": "phishing" if is_phishing else "legitimate",
	        "confidence": round(probs[best_idx], 4),
	        "raw_scores": confidences,
	        "is_malicious": is_phishing,
	    }

	# Script entry point: start a uvicorn server for the app when this file is
	# executed directly (not when it is imported).
	if __name__ == "__main__":
	    import uvicorn

	    # Listen on all interfaces; 7860 is the standard port for
	    # Hugging Face Spaces.
	    uvicorn.run(
	        app,
	        host="0.0.0.0",
	        port=7860,
	    )