"""FastAPI service that classifies a URL as phishing or legitimate.

The tokenizer and sequence-classification model named by ``MODEL_ID`` are
loaded once at import time and shared by every request.
"""
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

app = FastAPI()

# NEW MODEL: Multimodal Phishing Detector (URLs, SMS, Email)
MODEL_ID = "ealvaradob/bert-finetuned-phishing"

# Inputs shorter than this (after trimming whitespace) are rejected up front.
MIN_URL_LENGTH = 4
# Token budget handed to the tokenizer; longer inputs are truncated.
MAX_TOKENS = 512

print("Loading model... This might take a minute as it's a 'large' BERT model.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)


class URLInput(BaseModel):
    """Request body for ``POST /predict``."""

    url: str  # the text to classify


def _is_phishing_label(label_name: str) -> bool:
    """Return True when a model label name denotes the phishing class.

    Accepts the generic ``LABEL_1`` / ``1`` names as well as any label that
    contains the word "phishing" (case-insensitive). The generic name is
    matched exactly: a plain substring test for "1" would also flag
    unrelated labels such as ``LABEL_10``.
    """
    normalized = label_name.strip().lower()
    return normalized in {"label_1", "1"} or "phishing" in normalized


@app.get("/")
async def root():
    """Health-check endpoint reporting that the API is up."""
    return {"status": "URL Phishing Detector API is running"}


@app.post("/predict")
async def predict_url(data: URLInput):
    """Classify ``data.url`` and return the prediction with its scores.

    Returns:
        ``{"error": "Invalid URL provided"}`` when the input is empty,
        whitespace-only, or shorter than ``MIN_URL_LENGTH`` characters after
        trimming. Otherwise a dict with:

        * ``url`` - the input exactly as received,
        * ``prediction`` - ``"phishing"`` or ``"legitimate"``,
        * ``confidence`` - probability of the winning label, rounded to 4 places,
        * ``raw_scores`` - mapping of every model label to its probability,
        * ``is_malicious`` - True when the winning label is the phishing class.
    """
    # 1. Basic pre-check. Validate the trimmed text so that whitespace-only
    #    input is rejected instead of being sent to the model.
    candidate = data.url.strip()
    if len(candidate) < MIN_URL_LENGTH:
        return {"error": "Invalid URL provided"}

    # 2. Tokenize and predict.
    # NOTE(review): the forward pass is a synchronous call inside an
    # ``async def`` handler, so it occupies the event loop while it runs.
    inputs = tokenizer(
        candidate,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax turns the logits into per-class probabilities; [0] selects the
    # single sequence in the batch.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()

    # 3. Dynamic label mapping: use whatever names the model config declares.
    id2label = model.config.id2label
    confidences = {id2label[i]: prob for i, prob in enumerate(probs)}

    # Pick the class with the highest probability (first one wins on ties).
    top_index = max(range(len(probs)), key=probs.__getitem__)
    is_phishing = _is_phishing_label(id2label[top_index])

    return {
        "url": data.url,
        "prediction": "phishing" if is_phishing else "legitimate",
        "confidence": round(probs[top_index], 4),
        "raw_scores": confidences,
        "is_malicious": is_phishing,
    }


if __name__ == "__main__":
    import uvicorn

    # 7860 is the standard port for Hugging Face Spaces
    uvicorn.run(app, host="0.0.0.0", port=7860)