| |
| import torch |
| import numpy as np |
| import joblib |
| import json |
| from transformers import DistilBertTokenizerFast, DistilBertModel |
|
|
class Predictor:
    """Score a piece of text against a fixed label set.

    The text is encoded with a pretrained DistilBERT model, the hidden state
    at the first token position is used as a fixed-size embedding, and that
    embedding is passed to a model loaded from a joblib file.

    Attributes are set in ``__init__``: ``model`` (the joblib-loaded
    estimator), ``id2label`` (``int`` label id -> label name), ``tokenizer``,
    ``embedding_model`` and ``device``.
    """

    # Checkpoint used for both the tokenizer and the encoder, so they cannot
    # drift apart.
    ENCODER_NAME = "distilbert-base-uncased"
    # Maximum number of tokens per input; longer inputs are truncated.
    MAX_LENGTH = 128

    def __init__(self, model_path='xgboost_model.joblib', mappings_path='label_mappings.json', device=None):
        """Load the classifier, the label mapping and the DistilBERT encoder.

        Args:
            model_path: Path to the joblib-serialized model. It must expose
                ``predict``. (Presumably an XGBoost model, judging by the
                default file name -- confirm against the training code.)
            mappings_path: Path to a JSON file containing an ``"id2label"``
                object whose keys are label ids (as strings) and whose values
                are label names.
            device: Device to run the encoder on. When ``None``, CUDA is used
                if available, otherwise the CPU.
        """
        # SECURITY NOTE: joblib.load unpickles arbitrary objects; only load
        # model files from a trusted source.
        self.model = joblib.load(model_path)

        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(mappings_path, 'r', encoding='utf-8') as f:
            mappings = json.load(f)
        # JSON object keys are always strings; convert them back to int ids.
        self.id2label = {int(k): v for k, v in mappings['id2label'].items()}

        self.tokenizer = DistilBertTokenizerFast.from_pretrained(self.ENCODER_NAME)
        self.embedding_model = DistilBertModel.from_pretrained(self.ENCODER_NAME)

        # Explicit None check so that falsy-but-valid devices (e.g. index 0)
        # are honoured instead of being silently replaced.
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.embedding_model.to(self.device)
        # The encoder is used for inference only.
        self.embedding_model.eval()

    def generate_embedding(self, text):
        """Return the embedding of ``text`` as a NumPy array.

        Args:
            text: A single string to embed.

        Returns:
            ``last_hidden_state[:, 0, :]`` of the encoder output, moved to the
            CPU and converted to NumPy. Because exactly one text is encoded,
            the leading (batch) dimension has size 1.
        """
        inputs = self.tokenizer(
            [text],
            padding=True,
            truncation=True,
            max_length=self.MAX_LENGTH,
            return_tensors="pt",
        ).to(self.device)
        # No gradients are needed for inference.
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
            # Hidden state at the first token position of each sequence.
            embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        return embedding

    def predict(self, text):
        """Return a ``{label name: score}`` dict for ``text``.

        Args:
            text: A single string to classify.

        Returns:
            One entry per label in ``id2label``, ordered by label id. Each
            value is ``float(y_pred[0][label_id])``.

        Note:
            Assumes ``self.model.predict`` returns a 2-D array-like with one
            column per label id -- confirm against how the model was trained.
        """
        embedding = self.generate_embedding(text)
        # A single text was embedded, so only row 0 of the output is relevant.
        scores = self.model.predict(embedding)[0]
        # Iterate the mapping's own ids (sorted) rather than range(len(...)),
        # which would raise KeyError for non-contiguous label ids.
        return {
            label: float(scores[label_id])
            for label_id, label in sorted(self.id2label.items())
        }
|
|
| |
# Manual smoke test: build a Predictor from the default artifact paths and
# print the per-label scores for one sample sentence.
if __name__ == "__main__":
    sample_text = "I write about American politics"
    label_scores = Predictor().predict(sample_text)
    print(label_scores)
|
|