| """ |
| Inference script for multimodal fraudulent paper detection. |
| """ |
|
|
| import os |
| import sys |
| import torch |
| import numpy as np |
| from transformers import AutoTokenizer |
| import argparse |
| import json |
|
|
| from model import MultimodalFraudDetector |
|
|
|
|
def predict_fraud(model, tokenizer, text, tabular, metadata, device,
                  *, threshold=0.5, max_length=512):
    """Predict fraud probability for a single paper.

    Args:
        model: Detector called with the keyword arguments ``text_input_ids``,
            ``text_attention_mask``, ``tabular_features`` and
            ``metadata_features``. It must return a mapping containing
            ``'logits'``, ``'modality_scores'`` and ``'anomaly_score'``.
            The model is switched to eval mode as a side effect.
        tokenizer: Callable that tokenizes ``text`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        text: The paper text to score.
        tabular: Tabular features for one paper (no batch axis; one is
            added here). Anything ``torch.as_tensor`` accepts.
        metadata: Metadata features for one paper (no batch axis).
        device: Torch device the inputs are moved to.
        threshold: Fraud probability strictly above which the paper is
            flagged as fraudulent. Defaults to 0.5.
        max_length: Token length the text is padded/truncated to.
            Defaults to 512.

    Returns:
        dict with ``'fraud_probability'`` (float), ``'is_fraudulent'``
        (bool), ``'modality_contributions'`` (dict of floats keyed by
        ``'text'``, ``'image'``, ``'tabular'``, ``'metadata'``) and
        ``'anomaly_score'`` (float).
    """
    model.eval()

    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # as_tensor accepts lists, NumPy arrays and existing tensors alike
    # (torch.tensor() warns when handed a tensor); unsqueeze adds the batch axis.
    tabular = torch.as_tensor(tabular, dtype=torch.float32).unsqueeze(0).to(device)
    metadata = torch.as_tensor(metadata, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(
            text_input_ids=input_ids,
            text_attention_mask=attention_mask,
            tabular_features=tabular,
            metadata_features=metadata,
        )

    # Column 1 of the class distribution is read as the "fraud" class.
    probs = torch.softmax(outputs['logits'], dim=1)
    fraud_prob = probs[0, 1].item()

    modality_scores = outputs['modality_scores'][0].cpu().tolist()
    anomaly_score = outputs['anomaly_score'][0].item()

    # Order matches the positions read from the model's modality_scores.
    modality_names = ('text', 'image', 'tabular', 'metadata')

    return {
        'fraud_probability': fraud_prob,
        'is_fraudulent': bool(fraud_prob > threshold),
        'modality_contributions': {
            name: float(modality_scores[index])
            for index, name in enumerate(modality_names)
        },
        'anomaly_score': anomaly_score,
    }
|
|
|
|
def explain_prediction(result, *, anomaly_threshold=0.7):
    """Generate a human-readable explanation of a prediction result.

    Args:
        result: Mapping with ``'fraud_probability'`` (required) and,
            optionally, ``'is_fraudulent'``, ``'modality_contributions'``
            (name -> score) and ``'anomaly_score'``.
        anomaly_threshold: Anomaly score strictly above which the outlier
            note is appended. Defaults to 0.7.

    Returns:
        The explanation lines joined with newlines: a verdict line, the
        highest-scoring modality (when any contributions are present), and
        an anomaly note (when the score exceeds ``anomaly_threshold``).
    """
    fraud_probability = result['fraud_probability']

    # Prefer the verdict already stored in the result so the text cannot
    # disagree with it; fall back to the 0.5 cutoff when it is absent.
    is_fraudulent = result.get('is_fraudulent')
    if is_fraudulent is None:
        is_fraudulent = fraud_probability > 0.5

    if is_fraudulent:
        explanations = [f"FRAUDULENT (probability: {fraud_probability:.2%})"]
    else:
        explanations = [f"AUTHENTIC (fraud probability: {fraud_probability:.2%})"]

    # max() raises ValueError on an empty mapping, so skip the line instead.
    contrib = result.get('modality_contributions') or {}
    if contrib:
        max_modality = max(contrib, key=contrib.get)
        explanations.append(
            f"Primary fraud indicator: {max_modality} modality "
            f"(score: {contrib[max_modality]:.3f})"
        )

    anomaly_score = result.get('anomaly_score')
    if anomaly_score is not None and anomaly_score > anomaly_threshold:
        explanations.append(
            f"High anomaly score ({anomaly_score:.3f}): "
            "Paper shows strong outlier patterns"
        )

    return "\n".join(explanations)
|
|
|
|
def main():
    """Command-line entry point: load a checkpoint, score one paper, save JSON.

    Parses ``--model_path`` (required), ``--text``, ``--title`` and
    ``--output``; rebuilds the detector from the checkpoint; runs
    ``predict_fraud`` on ``"<title> [SEP] <text>"``; prints the explanation
    and writes the full result to the ``--output`` JSON file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', required=True)
    parser.add_argument('--text', default='')
    parser.add_argument('--title', default='')
    parser.add_argument('--output', default='prediction.json')
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(args.model_path, map_location=device)

    # Training scripts commonly store either a dict or the raw argparse
    # Namespace under 'args'; normalise both to a dict.
    model_args = checkpoint.get('args') or {}
    if isinstance(model_args, argparse.Namespace):
        model_args = vars(model_args)

    # Resolve the backbone name once so the model and the tokenizer always
    # agree (previously the tokenizer had no fallback and received None
    # when the checkpoint carried no 'text_model' entry).
    text_model = model_args.get('text_model') or 'allenai/scibert_scivocab_uncased'

    num_tabular_features = 10
    num_metadata_features = 12

    model = MultimodalFraudDetector(
        text_model=text_model,
        tabular_features=num_tabular_features,
        metadata_features=num_metadata_features,
    ).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(text_model)

    text = f"{args.title} [SEP] {args.text}"

    # NOTE(review): the tabular and metadata inputs are random placeholders,
    # so the score is not reproducible and only the text reflects the paper.
    # Replace with real feature extraction before relying on the output.
    print(
        "Warning: tabular and metadata features are random placeholders; "
        "the prediction reflects real input for the text modality only.",
        file=sys.stderr,
    )
    tabular = np.random.randn(num_tabular_features).astype(np.float32)
    metadata = np.random.randn(num_metadata_features).astype(np.float32)

    result = predict_fraud(model, tokenizer, text, tabular, metadata, device)
    result['explanation'] = explain_prediction(result)

    print(result['explanation'])

    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

    print(f"\nSaved to {args.output}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|