# URL_PART / app.py
# Nick-2x's picture
# Create app.py
# 103e422 verified
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Hugging Face Hub checkpoint used for classification
# (phishing detector covering URLs, SMS and email).
MODEL_ID = "ealvaradob/bert-finetuned-phishing"

# ASGI application object that the route decorators below attach to.
app = FastAPI()

# The tokenizer and classifier are loaded once, at import time, and shared
# by every request handled by this process.
print("Loading model... This might take a minute as it's a 'large' BERT model.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
class URLInput(BaseModel):
    """Request body accepted by the ``/predict`` endpoint."""

    # The text to classify; a required string field.
    url: str
@app.get("/")
async def root():
    """Report that the service is up.

    Returns:
        A single-key dict with a fixed human-readable status message.
    """
    status_payload = {"status": "URL Phishing Detector API is running"}
    return status_payload
def _is_phishing_label(label_name: str) -> bool:
    """Decide whether a model label name denotes the phishing class.

    A label counts as phishing when it contains the word "phishing"
    (case-insensitive) or when it ends in a numeric class index equal to 1,
    as in the generic name "LABEL_1". Only the trailing index is inspected:
    a plain ``"1" in label_name`` substring test would also match unrelated
    names such as "LABEL_10" or "v1_benign".
    """
    normalized = label_name.strip().lower()
    if "phishing" in normalized:
        return True
    # Split off the run of ASCII digits at the end of the name, if any.
    stem = normalized.rstrip("0123456789")
    class_index = normalized[len(stem):]
    return bool(class_index) and int(class_index) == 1


@app.post("/predict")
async def predict_url(data: URLInput):
    """Classify the submitted text as phishing or legitimate.

    Args:
        data: Request body whose ``url`` field holds the text to classify.

    Returns:
        ``{"error": "Invalid URL provided"}`` when the input, ignoring
        surrounding whitespace, is shorter than 4 characters. The error is
        returned as an ordinary response body, not raised.

        Otherwise a dict with:
          * ``url`` - the input exactly as received,
          * ``prediction`` - ``"phishing"`` or ``"legitimate"``,
          * ``confidence`` - probability of the top label, rounded to 4 places,
          * ``raw_scores`` - mapping of every model label to its probability,
          * ``is_malicious`` - boolean form of ``prediction``.
    """
    # 1. Basic pre-check. Validate the stripped text so that whitespace-only
    #    input cannot slip past the minimum-length test.
    url = data.url.strip()
    if len(url) < 4:
        return {"error": "Invalid URL provided"}

    # 2. Tokenize and predict. Inputs longer than 512 tokens are truncated.
    encoded = tokenizer(url, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():  # inference only; no gradients needed
        logits = model(**encoded).logits
        # Softmax over the class dimension; [0] selects the single input.
        probs = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()

    # 3. Map each class index to the label name declared in the model config.
    confidences = {
        model.config.id2label[index]: prob for index, prob in enumerate(probs)
    }

    # The label with the highest probability is the prediction.
    label_name = max(confidences, key=confidences.get)
    is_phishing = _is_phishing_label(label_name)

    return {
        "url": data.url,
        "prediction": "phishing" if is_phishing else "legitimate",
        "confidence": round(confidences[label_name], 4),
        "raw_scores": confidences,
        "is_malicious": is_phishing,
    }
if __name__ == "__main__":
    # Imported here so uvicorn is only required when this file is run
    # directly as a script.
    from uvicorn import run as serve

    # 7860 is the standard port for Hugging Face Spaces; 0.0.0.0 listens on
    # all network interfaces.
    serve(app, host="0.0.0.0", port=7860)