| from transformers import AutoModel, AutoTokenizer |
| import torch |
| import numpy as np |
| from sklearn.linear_model import LogisticRegression |
| import joblib |
|
|
|
|
|
|
| |
# Text encoder: the tokenizer and the transformer weights are both resolved
# from the same Hugging Face identifier, at import time.
rubert_model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(rubert_model_name)
model = AutoModel.from_pretrained(rubert_model_name)

# Classifier applied on top of the encoder embeddings. The path is relative,
# so it is resolved against the current working directory of the process.
# NOTE: joblib.load unpickles the file -- only point this at trusted artifacts.
logreg_model_path = "model_data/logreg_model_v2.joblib"
logreg_model = joblib.load(logreg_model_path)
|
|
def embed_bert_cls(text, model, tokenizer):
    """Return L2-normalised first-token embeddings of ``text``.

    Args:
        text: A string (or whatever batch input ``tokenizer`` accepts).
        model: Callable encoder invoked as ``model(**inputs)``; its result
            must expose ``last_hidden_state``. If it has a ``device``
            attribute, the tokenized inputs are moved to that device first.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors; it is called with ``padding="max_length"``,
            ``truncation=True``, ``max_length=128`` and ``return_tensors="pt"``.

    Returns:
        A 2-D ``numpy.ndarray`` with one unit-length row per input sequence.
    """
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # The result is brought back with ``.cpu()`` below, so the model may live
    # on an accelerator; the inputs have to sit on the same device or the
    # forward pass fails with a device-mismatch error.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence (the [CLS] token for BERT-style tokenizers).
    cls_vectors = outputs.last_hidden_state[:, 0, :]
    # p=2, dim=1 are the defaults of ``normalize``; spelled out for clarity.
    cls_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
    return cls_vectors.cpu().numpy()
|
|
def classify_text(text, model=model, tokenizer=tokenizer, classifier=logreg_model):
    """Label ``text`` as 'Good', 'Neutral' or 'Bad'.

    The text is embedded with ``embed_bert_cls`` and the embedding is passed
    to ``classifier.predict``. Only the first prediction is used, and it is
    looked up in a fixed 0/1/2 -> label table, so a prediction outside that
    set raises ``KeyError``.

    Args:
        text: Input forwarded unchanged to ``embed_bert_cls``.
        model: Encoder used for embedding (defaults to the module-level one).
        tokenizer: Tokenizer paired with ``model``.
        classifier: Object with a ``predict(embeddings)`` method.

    Returns:
        The label string for the first predicted class.
    """
    label_names = {0: 'Good', 1: 'Neutral', 2: 'Bad'}
    features = embed_bert_cls(text, model, tokenizer)
    predicted = classifier.predict(features)
    return label_names[predicted[0]]