"""
Inference script for multimodal fraudulent paper detection.
"""
import os
import sys
import torch
import numpy as np
from transformers import AutoTokenizer
import argparse
import json
from model import MultimodalFraudDetector
def predict_fraud(model, tokenizer, text, tabular, metadata, device,
                  threshold=0.5, max_length=512):
    """Predict fraud probability for a single paper.

    Args:
        model: Multimodal detector; called with keyword tensors and expected
            to return a dict with 'logits' (batch, 2), 'modality_scores'
            (batch, 4) and 'anomaly_score' (batch,) entries.
        tokenizer: HuggingFace-style tokenizer used to encode ``text``.
        text: Raw paper text (title + body).
        tabular: 1-D array-like of tabular features.
        metadata: 1-D array-like of metadata features.
        device: torch.device to run inference on.
        threshold: Decision threshold on the fraud probability (default 0.5,
            matching the original hard-coded behavior).
        max_length: Maximum token length for the text encoder (default 512).

    Returns:
        dict with keys 'fraud_probability', 'is_fraudulent',
        'modality_contributions' (per-modality floats) and 'anomaly_score'.
    """
    model.eval()

    # Tokenize text to fixed-length tensors.
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Add a batch dimension so the model sees shape (1, n_features).
    tabular = torch.tensor(tabular, dtype=torch.float32).unsqueeze(0).to(device)
    metadata = torch.tensor(metadata, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(
            text_input_ids=input_ids,
            text_attention_mask=attention_mask,
            tabular_features=tabular,
            metadata_features=metadata
        )

    logits = outputs['logits']
    probs = torch.softmax(logits, dim=1)
    fraud_prob = probs[0, 1].item()  # class index 1 == fraudulent

    modality_scores = outputs['modality_scores'][0].cpu().numpy()
    anomaly_score = outputs['anomaly_score'][0].item()

    return {
        'fraud_probability': fraud_prob,
        'is_fraudulent': fraud_prob > threshold,
        'modality_contributions': {
            'text': float(modality_scores[0]),
            'image': float(modality_scores[1]),
            'tabular': float(modality_scores[2]),
            'metadata': float(modality_scores[3])
        },
        'anomaly_score': anomaly_score
    }
def explain_prediction(result):
    """Generate a human-readable explanation from a predict_fraud() result.

    Consistency fix: uses the precomputed 'is_fraudulent' flag rather than
    re-deriving the decision from the probability, so the verdict stays in
    agreement with the classifier even if its decision threshold changes.

    Args:
        result: dict as returned by predict_fraud().

    Returns:
        Multi-line string: verdict, dominant modality, and (when high) the
        anomaly score.
    """
    explanations = []

    # Verdict line.
    if result['is_fraudulent']:
        explanations.append(
            f"FRAUDULENT (probability: {result['fraud_probability']:.2%})")
    else:
        explanations.append(
            f"AUTHENTIC (fraud probability: {result['fraud_probability']:.2%})")

    # Report which modality contributed most to the decision.
    contrib = result['modality_contributions']
    max_modality = max(contrib, key=contrib.get)
    explanations.append(
        f"Primary fraud indicator: {max_modality} modality "
        f"(score: {contrib[max_modality]:.3f})")

    # Only surface the anomaly score when it is notably high.
    if result['anomaly_score'] > 0.7:
        explanations.append(
            f"High anomaly score ({result['anomaly_score']:.3f}): "
            f"Paper shows strong outlier patterns")

    return "\n".join(explanations)
def main():
    """CLI entry point: load a checkpoint, score one paper, save JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', required=True)
    parser.add_argument('--text', default='')
    parser.add_argument('--title', default='')
    parser.add_argument('--output', default='prediction.json')
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load model checkpoint.
    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # checkpoints from trusted sources.
    checkpoint = torch.load(args.model_path, map_location=device)
    model_args = checkpoint.get('args', {})
    # NOTE(review): assumes 'args' was saved as a dict; confirm the training
    # script does not store an argparse.Namespace here (no .get method).

    # Resolve the text-encoder name once so model and tokenizer always agree.
    # Bug fix: the tokenizer was previously loaded with .get('text_model')
    # and no default, crashing when the checkpoint omitted the key even
    # though model construction had a fallback.
    text_model = model_args.get('text_model', 'allenai/scibert_scivocab_uncased')

    model = MultimodalFraudDetector(
        text_model=text_model,
        tabular_features=10,
        metadata_features=12
    ).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(text_model)

    # Prepare input: title and body joined by the [SEP] token.
    text = f"{args.title} [SEP] {args.text}"

    # Dummy features for demo (in production, extract from actual paper)
    tabular = np.random.randn(10).astype(np.float32)
    metadata = np.random.randn(12).astype(np.float32)

    # Predict and attach a human-readable explanation.
    result = predict_fraud(model, tokenizer, text, tabular, metadata, device)
    result['explanation'] = explain_prediction(result)
    print(result['explanation'])

    with open(args.output, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to {args.output}")


if __name__ == '__main__':
    main()