# Source: Hugging Face Hub file page (uploader: pangweijlu).
# Commit message: "Upload inference.py with huggingface_hub"
# Commit: 5fc37d4 (verified)
"""
Inference script for multimodal fraudulent paper detection.
"""
import os
import sys
import torch
import numpy as np
from transformers import AutoTokenizer
import argparse
import json
from model import MultimodalFraudDetector
def predict_fraud(model, tokenizer, text, tabular, metadata, device):
    """Score one paper and return its fraud prediction as a plain dict.

    The text is tokenized to a fixed length of 512 (padded and truncated),
    the two feature vectors are turned into float32 tensors with a leading
    batch dimension of 1, and a single gradient-free forward pass is run
    with the model in eval mode.

    Args:
        model: Object with an ``eval()`` method that is callable with the
            keyword arguments ``text_input_ids``, ``text_attention_mask``,
            ``tabular_features`` and ``metadata_features`` and returns a
            mapping with ``logits``, ``modality_scores`` and
            ``anomaly_score`` entries.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors (called with ``return_tensors='pt'``).
        text: Paper text to classify.
        tabular: Tabular feature vector for the paper.
        metadata: Metadata feature vector for the paper.
        device: Torch device that all input tensors are moved to.

    Returns:
        dict with keys ``fraud_probability`` (softmax probability of class
        index 1), ``is_fraudulent`` (probability strictly above 0.5),
        ``modality_contributions`` (floats keyed by ``text``, ``image``,
        ``tabular``, ``metadata``) and ``anomaly_score``.
    """
    model.eval()

    tokens = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    def _as_batch(features):
        # Feature vector -> float32 tensor with a batch dimension, on `device`.
        return torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)

    model_inputs = {
        'text_input_ids': tokens['input_ids'].to(device),
        'text_attention_mask': tokens['attention_mask'].to(device),
        'tabular_features': _as_batch(tabular),
        'metadata_features': _as_batch(metadata),
    }

    with torch.no_grad():
        outputs = model(**model_inputs)
        class_probs = torch.softmax(outputs['logits'], dim=1)
        # Column 1 of the first (only) batch row is read as the fraud class.
        fraud_prob = class_probs[0, 1].item()
        per_modality = outputs['modality_scores'][0].cpu().numpy()
        anomaly = outputs['anomaly_score'][0].item()

    # Position i of the model's modality scores is reported under this name.
    modality_names = ('text', 'image', 'tabular', 'metadata')
    return {
        'fraud_probability': fraud_prob,
        'is_fraudulent': fraud_prob > 0.5,
        'modality_contributions': {
            name: float(per_modality[position])
            for position, name in enumerate(modality_names)
        },
        'anomaly_score': anomaly,
    }
def explain_prediction(result):
    """Render a prediction dict as a short, newline-separated summary.

    Args:
        result: Mapping with ``fraud_probability``, ``modality_contributions``
            (name -> score) and ``anomaly_score`` entries.

    Returns:
        A string with a verdict line, a line naming the highest-scoring
        modality, and, only when the anomaly score exceeds 0.7, an extra
        line reporting it.
    """
    fraud_prob = result['fraud_probability']
    if fraud_prob > 0.5:
        verdict = f"FRAUDULENT (probability: {fraud_prob:.2%})"
    else:
        verdict = f"AUTHENTIC (fraud probability: {fraud_prob:.2%})"
    lines = [verdict]

    # Highest-scoring modality; on ties the first one in dict order wins.
    scores = result['modality_contributions']
    top_name, top_score = max(scores.items(), key=lambda pair: pair[1])
    lines.append(f"Primary fraud indicator: {top_name} modality (score: {top_score:.3f})")

    anomaly = result['anomaly_score']
    if anomaly > 0.7:
        lines.append(f"High anomaly score ({anomaly:.3f}): Paper shows strong outlier patterns")

    return "\n".join(lines)
def main():
    """Command-line entry point: load a checkpoint, score one paper, save JSON.

    Command-line arguments:
        --model_path: Path to a checkpoint containing ``model_state_dict``
            and, optionally, an ``args`` entry with a ``text_model`` name.
        --title, --text: Paper title and body; joined as
            ``"<title> [SEP] <text>"`` before tokenization.
        --output: Path of the JSON file to write (default ``prediction.json``).

    The prediction and its explanation are printed to stdout and the full
    result dict is written to the output file.

    Raises:
        KeyError: If the checkpoint has no ``model_state_dict`` entry.
    """
    # Fallback encoder name when the checkpoint does not record one.
    default_text_model = 'allenai/scibert_scivocab_uncased'
    # Feature-vector sizes; shared by the model constructor and the dummy
    # inputs below so the two cannot drift apart.
    n_tabular_features = 10
    n_metadata_features = 12

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', required=True)
    parser.add_argument('--text', default='')
    parser.add_argument('--title', default='')
    parser.add_argument('--output', default='prediction.json')
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load model
    # NOTE(security): torch.load unpickles the file; only point --model_path
    # at checkpoints from a trusted source.
    checkpoint = torch.load(args.model_path, map_location=device)

    # The saved training args may be missing, None, or an argparse.Namespace
    # rather than a dict; normalise to a dict before reading from it.
    model_args = checkpoint.get('args') or {}
    if not isinstance(model_args, dict):
        model_args = vars(model_args)

    # Resolve the encoder name once so the model and the tokenizer always
    # agree (previously the tokenizer received None when the name was absent).
    text_model = model_args.get('text_model') or default_text_model

    model = MultimodalFraudDetector(
        text_model=text_model,
        tabular_features=n_tabular_features,
        metadata_features=n_metadata_features,
    ).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(text_model)

    # Prepare input
    text = f"{args.title} [SEP] {args.text}"

    # Dummy features for demo (in production, extract from actual paper).
    # NOTE: these are unseeded random draws, so the prediction for the same
    # paper can differ from one run to the next.
    tabular = np.random.randn(n_tabular_features).astype(np.float32)
    metadata = np.random.randn(n_metadata_features).astype(np.float32)

    # Predict
    result = predict_fraud(model, tokenizer, text, tabular, metadata, device)
    result['explanation'] = explain_prediction(result)
    print(result['explanation'])

    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to {args.output}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()