# File size: 3,965 Bytes
# 5fc37d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Inference script for multimodal fraudulent paper detection.
"""

import os
import sys
import torch
import numpy as np
from transformers import AutoTokenizer
import argparse
import json

from model import MultimodalFraudDetector


def predict_fraud(model, tokenizer, text, tabular, metadata, device):
    """Score a single paper and return the prediction as a plain dict.

    Args:
        model: Callable module invoked with the keyword arguments
            ``text_input_ids``, ``text_attention_mask``,
            ``tabular_features`` and ``metadata_features``; its output is
            indexed by ``'logits'``, ``'modality_scores'`` and
            ``'anomaly_score'``. It is switched to eval mode here.
        tokenizer: Callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        text: Paper text to tokenize (padded/truncated to 512 tokens).
        tabular: Numeric features accepted by ``torch.tensor``.
        metadata: Numeric features accepted by ``torch.tensor``.
        device: Torch device the inputs are moved to.

    Returns:
        Dict with ``fraud_probability`` (softmax probability of class
        index 1), ``is_fraudulent`` (probability strictly above 0.5),
        ``modality_contributions`` (first four modality scores keyed by
        text/image/tabular/metadata) and ``anomaly_score``.
    """
    model.eval()

    # Fixed-length encoding: short inputs are padded, long ones truncated.
    tokens = tokenizer(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    def _as_batch(values):
        # float32 tensor with a leading batch axis of size 1, on `device`.
        return torch.tensor(values, dtype=torch.float32).unsqueeze(0).to(device)

    model_inputs = {
        'text_input_ids': tokens['input_ids'].to(device),
        'text_attention_mask': tokens['attention_mask'].to(device),
        'tabular_features': _as_batch(tabular),
        'metadata_features': _as_batch(metadata),
    }

    # Inference only: no autograd bookkeeping during the forward pass.
    with torch.no_grad():
        outputs = model(**model_inputs)

    class_probs = torch.softmax(outputs['logits'], dim=1)
    fraud_prob = class_probs[0, 1].item()

    # Only the first item of the batch is reported.
    scores = outputs['modality_scores'][0].cpu().numpy()
    anomaly = outputs['anomaly_score'][0].item()

    modality_names = ('text', 'image', 'tabular', 'metadata')
    return {
        'fraud_probability': fraud_prob,
        'is_fraudulent': fraud_prob > 0.5,
        'modality_contributions': {
            name: float(scores[idx]) for idx, name in enumerate(modality_names)
        },
        'anomaly_score': anomaly,
    }


def explain_prediction(result):
    """Render a prediction dict as a short, newline-separated summary.

    Args:
        result: Mapping with ``fraud_probability`` (number),
            ``modality_contributions`` (name -> score mapping) and
            ``anomaly_score`` (number).

    Returns:
        A string with the verdict line, the highest-scoring modality, and
        an extra line when the anomaly score is strictly above 0.7.
    """
    fraud_prob = result['fraud_probability']
    # Verdict flips to FRAUDULENT only strictly above the 0.5 mark.
    if fraud_prob > 0.5:
        verdict = f"FRAUDULENT (probability: {fraud_prob:.2%})"
    else:
        verdict = f"AUTHENTIC (fraud probability: {fraud_prob:.2%})"

    # Highest-scoring modality; on ties the first key in iteration order wins.
    scores = result['modality_contributions']
    top_name = max(scores, key=scores.get)
    lines = [
        verdict,
        f"Primary fraud indicator: {top_name} modality (score: {scores[top_name]:.3f})",
    ]

    anomaly = result['anomaly_score']
    if anomaly > 0.7:
        lines.append(f"High anomaly score ({anomaly:.3f}): Paper shows strong outlier patterns")

    return "\n".join(lines)


def main():
    """Command-line entry point: score one paper and save the result.

    Parses ``--model_path`` (required), ``--text``, ``--title`` and
    ``--output`` (default ``prediction.json``), loads the checkpoint,
    rebuilds the detector and a matching tokenizer, runs the prediction,
    prints the explanation and writes the full result as JSON.

    NOTE: the tabular and metadata features are random placeholders, so
    repeated runs on the same text can give different predictions.
    """
    # Used for BOTH the model and the tokenizer when the checkpoint does
    # not record which text encoder it was trained with.
    default_text_model = 'allenai/scibert_scivocab_uncased'
    # Feature widths passed to the model; the placeholder inputs below
    # must use the same sizes.
    tabular_dim = 10
    metadata_dim = 12

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', required=True)
    parser.add_argument('--text', default='')
    parser.add_argument('--title', default='')
    parser.add_argument('--output', default='prediction.json')
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # SECURITY: torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    checkpoint = torch.load(args.model_path, map_location=device)
    model_args = checkpoint.get('args') or {}
    if isinstance(model_args, argparse.Namespace):
        # NOTE(review): tolerates a checkpoint that stored the raw
        # Namespace instead of a dict; confirm against the training script.
        model_args = vars(model_args)

    # Resolve the encoder name once so the model and tokenizer always agree
    # (the tokenizer previously got None when the checkpoint had no entry).
    text_model = model_args.get('text_model') or default_text_model

    model = MultimodalFraudDetector(
        text_model=text_model,
        tabular_features=tabular_dim,
        metadata_features=metadata_dim
    ).to(device)

    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(text_model)

    # Title and body are joined into one sequence around a literal [SEP].
    text = f"{args.title} [SEP] {args.text}"

    # Dummy features for demo (in production, extract from actual paper)
    tabular = np.random.randn(tabular_dim).astype(np.float32)
    metadata = np.random.randn(metadata_dim).astype(np.float32)

    result = predict_fraud(model, tokenizer, text, tabular, metadata, device)
    result['explanation'] = explain_prediction(result)

    print(result['explanation'])

    # Explicit encoding keeps the output independent of the platform locale.
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

    print(f"\nSaved to {args.output}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()