| from transformers import PreTrainedModel, PretrainedConfig |
| from tensorflow.keras.models import load_model |
| from tensorflow.keras.preprocessing.text import tokenizer_from_json |
| from tensorflow.keras.preprocessing.sequence import pad_sequences |
| import numpy as np |
| import json |
|
|
class NewsClassifierConfig(PretrainedConfig):
    """Configuration for :class:`NewsClassifier`.

    Args:
        max_length: Length every tokenized input is padded/truncated to
            before it is passed to the Keras model.
        vocab_size: Stored on the config; presumably the tokenizer
            vocabulary size of the trained network -- TODO confirm.
        embedding_dim: Stored on the config; presumably the embedding
            width of the trained network -- TODO confirm.
        hidden_size: Stored on the config; presumably the recurrent/dense
            width of the trained network -- TODO confirm.
        num_labels: Number of output classes.
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "custom"

    def __init__(
        self,
        max_length=41,
        vocab_size=74934,
        embedding_dim=128,
        hidden_size=64,
        num_labels=2,
        **kwargs
    ):
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        # ``num_labels`` is managed by the base class (it is derived from
        # ``id2label``). The base constructor falls back to 2 when it does
        # not receive the value, which would overwrite an assignment made
        # here, so hand it over explicitly instead of setting it ourselves.
        super().__init__(num_labels=num_labels, **kwargs)
|
|
class NewsClassifier(PreTrainedModel):
    """Expose a saved Keras news-source classifier as a ``PreTrainedModel``.

    The Keras network (``custom.h5``) and its tokenizer
    (``tokenizer.json``) are loaded on first use. Both paths are relative,
    so they are resolved against the current working directory.
    """

    config_class = NewsClassifierConfig
    base_model_prefix = "custom"

    def __init__(self, config):
        """Create the wrapper without touching the disk.

        Args:
            config: A ``NewsClassifierConfig``; ``config.max_length`` is
                used when padding inputs in :meth:`forward`.
        """
        super().__init__(config)
        # Both are populated lazily by ``post_init``.
        self.model = None
        self.tokenizer = None

    def post_init(self):
        """Load the Keras model and the tokenizer from disk.

        NOTE(review): this shadows ``PreTrainedModel.post_init``, so the
        base-class post-initialisation step never runs for this model --
        confirm that is intended.

        Raises:
            OSError: If ``tokenizer.json`` cannot be opened. Errors raised
                by ``load_model`` propagate unchanged.
        """
        self.model = load_model('custom.h5')
        # Pin the encoding so loading does not depend on the platform's
        # default locale (vocabulary entries may be non-ASCII).
        with open('tokenizer.json', 'r', encoding='utf-8') as f:
            tokenizer_data = json.load(f)
        self.tokenizer = tokenizer_from_json(tokenizer_data)

    def forward(self, text_input):
        """Classify one text or a collection of texts.

        Args:
            text_input: A single string, or an iterable of strings.

        Returns:
            For exactly one input text (a bare string or a one-element
            iterable): a dict ``{"label": ..., "score": ...}``. Otherwise
            a list of such dicts, one per text in input order; an empty
            input yields an empty list. ``label`` is ``"foxnews"`` when
            the model's second output value exceeds 0.5 and ``"nbc"``
            otherwise; ``score`` is the value associated with the chosen
            label (``p`` or ``1 - p``).
        """
        # Materialise once so generators work and ``len`` is well defined.
        if isinstance(text_input, str):
            texts = [text_input]
        else:
            texts = list(text_input)

        # Nothing to classify: skip loading and avoid handing Keras a
        # zero-row batch.
        if not texts:
            return []

        # Explicit ``is None`` checks: "not loaded yet" must not depend on
        # how the loaded objects happen to evaluate in a boolean context.
        if self.model is None or self.tokenizer is None:
            self.post_init()

        sequences = self.tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(sequences, maxlen=self.config.max_length)
        predictions = self.model.predict(padded, verbose=0)

        results = []
        for pred in predictions:
            # Assumes the model emits at least two values per sample, with
            # index 1 belonging to the "foxnews" class -- TODO confirm.
            score = float(pred[1])
            if score > 0.5:
                results.append({"label": "foxnews", "score": score})
            else:
                results.append({"label": "nbc", "score": 1 - score})

        # Kept for backward compatibility: a single text is unwrapped.
        return results[0] if len(texts) == 1 else results