theformatisvalid committed on
Commit
2153792
·
verified ·
1 Parent(s): 37958f4

Upload 7 files

Browse files
src/classical_classifiers.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, Optional, Union, Tuple
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sklearn.base import BaseEstimator, ClassifierMixin
5
+ from sklearn.linear_model import LogisticRegression
6
+ from sklearn.svm import SVC
7
+ from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import accuracy_score, classification_report
10
+ from sklearn.preprocessing import LabelEncoder
11
+
12
+ XGBClassifier = None
13
+ CatBoostClassifier = None
14
+ LGBMClassifier = None
15
+
16
+ try:
17
+ from xgboost import XGBClassifier
18
+ except ImportError:
19
+ pass
20
+
21
+ try:
22
+ from catboost import CatBoostClassifier
23
+ except ImportError:
24
+ pass
25
+
26
+ try:
27
+ from lightgbm import LGBMClassifier
28
+ except ImportError:
29
+ pass
30
+
31
+
32
def get_logistic_regression(
    penalty: str = "l2",
    C: float = 1.0,
    max_iter: int = 1000,
    solver: str = "liblinear",  # supports l1 and l2
    random_state: int = 42,
    l1_ratio: Optional[float] = None
) -> LogisticRegression:
    """Build a LogisticRegression whose solver is compatible with the penalty.

    Args:
        penalty: Regularization type: 'l1', 'l2', 'elasticnet', or 'none'.
        C: Inverse regularization strength.
        max_iter: Maximum solver iterations.
        solver: Preferred solver; silently replaced when it cannot handle
            the requested penalty.
        random_state: Seed for reproducibility.
        l1_ratio: Elastic-net mixing parameter; defaults to 0.5 when
            penalty='elasticnet' and no value is supplied (sklearn requires
            an explicit l1_ratio for elastic-net).

    Raises:
        ValueError: If penalty is not one of the supported strings.
    """
    if penalty not in ("l1", "l2", "elasticnet", "none"):
        raise ValueError("penalty must be 'l1', 'l2', 'elasticnet', or 'none'")
    if penalty == "l1" and solver not in ("liblinear", "saga"):
        solver = "liblinear"
    elif penalty == "elasticnet":
        # saga is the only solver supporting elastic-net; without this the
        # default liblinear solver would raise at fit time.
        solver = "saga"
        if l1_ratio is None:
            l1_ratio = 0.5
    elif penalty == "none" and solver == "liblinear":
        # liblinear always applies a penalty; fall back to lbfgs.
        solver = "lbfgs"
    return LogisticRegression(
        penalty=penalty,
        C=C,
        max_iter=max_iter,
        solver=solver,
        random_state=random_state,
        l1_ratio=l1_ratio
    )
50
+
51
+
52
def get_svm_linear(C: float = 1.0, random_state: int = 42) -> SVC:
    """Linear-kernel SVM with probability estimates enabled."""
    params = {
        "kernel": "linear",
        "C": C,
        "probability": True,  # needed for predict_proba / soft voting
        "random_state": random_state,
    }
    return SVC(**params)
54
+
55
+
56
def get_random_forest(
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    random_state: int = 42
) -> RandomForestClassifier:
    """Random-forest classifier with a fixed seed for reproducibility.

    max_depth=None lets trees grow until leaves are pure.
    """
    forest_kwargs = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }
    return RandomForestClassifier(**forest_kwargs)
66
+
67
+
68
def get_gradient_boosting(
    model_type: str = "xgb",
    **kwargs
) -> Any:
    """Instantiate a gradient-boosting classifier from an optional backend.

    The return annotation is ``Any`` on purpose: the previous
    ``Union[XGBClassifier, ...]`` annotation evaluated ``XGBClassifier`` at
    definition time, which is ``None`` when xgboost is not installed.

    Args:
        model_type: 'xgb' (XGBoost), 'cat' (CatBoost) or 'lgb' (LightGBM).
        **kwargs: Forwarded to the backend classifier; a deterministic seed
            (and quiet output for CatBoost) is filled in when absent.

    Raises:
        ImportError: If the requested backend is not installed.
        ValueError: For an unknown model_type.
    """
    if model_type == "xgb":
        if XGBClassifier is None:
            raise ImportError("XGBoost not installed. Run: pip install xgboost")
        kwargs.setdefault("random_state", 42)
        return XGBClassifier(**kwargs)
    elif model_type == "cat":
        if CatBoostClassifier is None:
            raise ImportError("CatBoost not installed. Run: pip install catboost")
        # CatBoost is verbose by default and uses 'random_seed', not 'random_state'.
        kwargs.setdefault("verbose", False)
        kwargs.setdefault("random_seed", 42)
        return CatBoostClassifier(**kwargs)
    elif model_type == "lgb":
        if LGBMClassifier is None:
            raise ImportError("LightGBM not installed. Run: pip install lightgbm")
        kwargs.setdefault("random_state", 42)
        return LGBMClassifier(**kwargs)
    else:
        raise ValueError("model_type must be 'xgb', 'cat', or 'lgb'")
90
+
91
+
92
def get_bagging_classifier(
    base_estimator: str = "tree",
    n_estimators: int = 10,
    random_state: int = 42
) -> BaggingClassifier:
    """Bagging ensemble over decision trees ('tree') or logistic regression ('lr').

    Raises:
        ValueError: For any other base_estimator value.
    """
    if base_estimator == "tree":
        from sklearn.tree import DecisionTreeClassifier
        base = DecisionTreeClassifier(random_state=random_state)
    elif base_estimator == "lr":
        base = get_logistic_regression()
    else:
        raise ValueError("base_estimator must be 'tree' or 'lr'")
    return BaggingClassifier(
        estimator=base,
        n_estimators=n_estimators,
        random_state=random_state
    )
109
+
110
+
111
def get_stacking_classifier(
    final_estimator: Optional[BaseEstimator] = None,
    cv: int = 5,
    random_state: int = 42
) -> StackingClassifier:
    """Two-level stacking ensemble.

    Base layer: logistic regression + linear SVM, plus a small CatBoost
    model when catboost is installed. The meta-model defaults to logistic
    regression when none is supplied.
    """
    base_models = [
        ("lr", get_logistic_regression()),
        ("svm", get_svm_linear()),
    ]
    if CatBoostClassifier is not None:
        base_models.append(("cat", get_gradient_boosting("cat", iterations=100)))

    meta = final_estimator if final_estimator is not None else get_logistic_regression()

    return StackingClassifier(
        estimators=base_models,
        final_estimator=meta,
        cv=cv,
        passthrough=False  # meta-model sees only base predictions
    )
132
+
133
+
134
def get_voting_classifier(
    voting: str = "soft",
    use_catboost: bool = True
) -> VotingClassifier:
    """Voting ensemble of LR, linear SVM and a 50-tree random forest,
    optionally adding CatBoost when it is installed.
    """
    members = [
        ("lr", get_logistic_regression()),
        ("svm", get_svm_linear()),
        ("rf", get_random_forest(n_estimators=50)),
    ]
    if use_catboost and CatBoostClassifier is not None:
        members.append(("cat", get_gradient_boosting("cat", iterations=50, verbose=False)))

    return VotingClassifier(estimators=members, voting=voting)
150
+
151
+
152
def tpot_classifier(
    generations: int = 5,
    population_size: int = 20,
    cv: int = 5,
    random_state: int = 42,
    verbosity: int = 0
) -> Any:
    """Configure (but do not fit) a TPOT AutoML classifier.

    Raises:
        ImportError: If the optional tpot package is missing.
    """
    try:
        from tpot import TPOTClassifier
    except ImportError as err:
        # Chain the original error so the underlying import failure stays visible.
        raise ImportError("TPOT not installed. Run: pip install tpot") from err

    return TPOTClassifier(
        generations=generations,
        population_size=population_size,
        cv=cv,
        random_state=random_state,
        verbosity=verbosity,
        n_jobs=-1  # use all available cores
    )
172
+
173
+
174
def h2o_classifier(
    max_runtime_secs: int = 300,
    seed: int = 42,
    exclude_algos: Optional[list] = None
) -> Any:
    """Configure (but do not launch) an H2O AutoML run.

    Args:
        max_runtime_secs: Wall-clock budget for the AutoML search.
        seed: Seed for reproducibility.
        exclude_algos: Optional list of algorithm names to skip.

    Raises:
        ImportError: If the optional h2o package is missing.
    """
    try:
        import h2o  # noqa: F401 -- ensures the H2O runtime is importable
        from h2o.automl import H2OAutoML
    except ImportError as err:
        # Chain the original error so the underlying import failure stays visible.
        raise ImportError("H2O not installed. Run: pip install h2o") from err

    aml = H2OAutoML(
        max_runtime_secs=max_runtime_secs,
        seed=seed,
        exclude_algos=exclude_algos
    )
    return aml
191
+
192
+
193
def train_and_evaluate(
    model: Union[BaseEstimator, Any],
    X_train: Union[np.ndarray, pd.DataFrame],
    y_train: Union[np.ndarray, pd.Series],
    X_test: Union[np.ndarray, pd.DataFrame],
    y_test: Union[np.ndarray, pd.Series],
    is_h2o: bool = False
) -> Dict[str, Any]:
    """Fit *model* on the training split and report held-out metrics.

    When is_h2o is True, the X/y arguments are expected to be H2OFrames
    and *model* an H2OAutoML instance; otherwise *model* is an
    sklearn-style estimator.

    Returns:
        H2O branch: {'accuracy', 'auc', 'best_model'}.
        sklearn branch: {'accuracy', 'report', 'model'}.
    """
    if is_h2o:
        import h2o  # ensures the H2O runtime is importable in this branch

        # H2O trains on a single frame holding features + target.
        train_frame = X_train.cbind(y_train)
        test_frame = X_test.cbind(y_test)
        y_col = y_train.columns[0]

        model.train(x=X_train.columns.tolist(), y=y_col, training_frame=train_frame)
        perf = model.model_performance(test_frame)
        return {
            # accuracy() returns [[threshold, accuracy]]; previously [0]
            # returned the whole pair instead of the metric value.
            "accuracy": perf.accuracy()[0][1],
            # NOTE(review): _has_auc is a private H2O API -- confirm it
            # survives h2o upgrades.
            "auc": perf.auc() if perf._has_auc() else None,
            "best_model": model.leader
        }
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return {
            "accuracy": accuracy_score(y_test, y_pred),
            "report": classification_report(y_test, y_pred, output_dict=True),
            "model": model
        }
src/imbalance_handling.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple, Union, Dict, Optional, Any, Callable
2
+ import numpy as np
3
+ import pandas as pd
4
+ from collections import Counter
5
+
6
+
7
def compute_class_weights(y: Union[List, np.ndarray], method: str = "balanced") -> Union[Dict[int, float], None]:
    """Per-class weights inversely proportional to class frequency.

    Implements scikit-learn's documented 'balanced' heuristic directly:
    weight(c) = n_samples / (n_classes * count(c)), with classes in sorted
    (np.unique) order -- identical values to
    sklearn.utils.class_weight.compute_class_weight('balanced', ...),
    but without requiring sklearn at call time.

    Returns:
        Mapping class -> weight for method='balanced'; None otherwise.
    """
    if method != "balanced":
        return None
    classes, counts = np.unique(y, return_counts=True)
    weights = len(y) / (len(classes) * counts.astype(float))
    return dict(zip(classes, weights))
15
+
16
+
17
def get_pytorch_weighted_loss(class_weights: Optional[Dict[int, float]] = None,
                              num_classes: Optional[int] = None) -> 'torch.nn.Module':
    """CrossEntropyLoss, optionally weighted per class.

    Args:
        class_weights: Mapping class index -> weight; the weight tensor
            follows sorted class-index order.
        num_classes: Unused; kept for backward compatibility with callers.

    Raises:
        ImportError: If PyTorch is not available.
    """
    try:
        import torch
        import torch.nn as nn
    except ImportError as err:
        # Chain the original error so the underlying failure stays visible.
        raise ImportError("PyTorch not installed") from err

    if class_weights is not None:
        ordered = [class_weights[c] for c in sorted(class_weights.keys())]
        weight_tensor = torch.tensor(ordered, dtype=torch.float)
        return nn.CrossEntropyLoss(weight=weight_tensor)
    else:
        return nn.CrossEntropyLoss()
30
+
31
+
32
def get_tensorflow_weighted_loss(class_weights: Optional[Dict[int, float]] = None) -> Callable:
    """Sparse categorical cross-entropy, optionally weighted per class.

    Without weights this returns the Keras loss identifier string; with
    weights it returns a closure that scales each sample's loss by the
    weight of its true class.
    """
    if not class_weights:
        # Keras resolves this identifier to the standard unweighted loss.
        return 'sparse_categorical_crossentropy'

    ordered_weights = [class_weights[c] for c in sorted(class_weights.keys())]

    import tensorflow as tf

    def weighted_sparse_categorical_crossentropy(y_true, y_pred):
        labels = tf.cast(y_true, tf.int32)
        # One weight per sample, picked out via a one-hot mask.
        one_hot = tf.one_hot(labels, depth=len(ordered_weights))
        sample_weights = tf.reduce_sum(one_hot * ordered_weights, axis=1)
        base_losses = tf.keras.losses.sparse_categorical_crossentropy(labels, y_pred)
        return tf.reduce_mean(base_losses * sample_weights)

    return weighted_sparse_categorical_crossentropy
49
+
50
+
51
def apply_sampling(
    X: np.ndarray,
    y: np.ndarray,
    method: str = "random_under",
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """Re-balance (X, y) with an imbalanced-learn sampler.

    Args:
        X: Feature matrix.
        y: Labels.
        method: 'random_under', 'random_over', 'smote' or 'adasyn'.
        random_state: Seed for reproducibility.

    Raises:
        ValueError: For an unknown method. Checked *before* importing
            imbalanced-learn so bad arguments fail fast even when the
            optional dependency is missing.
        ImportError: If imbalanced-learn is not installed.
    """
    if method not in ("random_under", "random_over", "smote", "adasyn"):
        raise ValueError("method must be one of: random_under, random_over, smote, adasyn")

    from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler

    sampler_classes = {
        "random_under": RandomUnderSampler,
        "random_over": RandomOverSampler,
        "smote": SMOTE,
        "adasyn": ADASYN,
    }
    sampler = sampler_classes[method](random_state=random_state)
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res
74
+
75
+
76
def augment_texts(
    texts: List[str],
    labels: List[Any],
    augmentation_type: str = "synonym",
    aug_p: float = 0.1,
    lang: str = "ru",  # language code
    model_name: Optional[str] = None,
    num_aug: int = 1,
    random_state: int = 42
) -> Tuple[List[str], List[Any]]:
    """Generate augmented copies of labelled texts with nlpaug.

    Supported augmentation_type values: 'synonym', 'insert', 'delete',
    'swap', 'eda', 'back_trans', 'llm'. Returns only the augmented texts
    and their labels (the originals are NOT included); each input text
    yields num_aug augmented variants.

    NOTE(review): random_state is accepted but never used -- nlpaug's own
    randomness is not seeded here; confirm whether reproducibility matters
    to callers.

    Raises:
        ImportError: If nlpaug is not installed.
        NotImplementedError: For augmentation_type='llm'.
        ValueError: For an unknown augmentation_type.
    """
    try:
        import nlpaug.augmenter.word as naw
        import nlpaug.augmenter.sentence as nas
    except ImportError:
        raise ImportError("Install nlpaug: pip install nlpaug")

    augmented_texts = []
    augmented_labels = []

    if augmentation_type == "synonym":
        if lang == "en":
            # WordNet-based synonym replacement (English only).
            aug = naw.SynonymAug(aug_p=aug_p, aug_max=None)
        else:
            # Non-English: contextual substitution via multilingual BERT
            # instead of a synonym dictionary.
            aug = naw.ContextualWordEmbsAug(
                model_path='bert-base-multilingual-cased',
                action="substitute",
                aug_p=aug_p,
                device='cpu'
            )
    elif augmentation_type == "insert":
        aug = naw.RandomWordAug(action="insert", aug_p=aug_p)
    elif augmentation_type == "delete":
        aug = naw.RandomWordAug(action="delete", aug_p=aug_p)
    elif augmentation_type == "swap":
        aug = naw.RandomWordAug(action="swap", aug_p=aug_p)
    elif augmentation_type == "eda":
        # NOTE(review): labelled 'eda' but only applies antonym replacement,
        # not the full Easy-Data-Augmentation recipe -- confirm intent.
        aug = naw.AntonymAug()
    elif augmentation_type == "back_trans":
        # Pick a translation model pair; default is ru<->en round-trip.
        if not model_name:
            if lang == "ru":
                model_name = "Helsinki-NLP/opus-mt-ru-en"
                back_model = "Helsinki-NLP/opus-mt-en-ru"
            else:
                model_name = "Helsinki-NLP/opus-mt-en-ru"
                back_model = "Helsinki-NLP/opus-mt-ru-en"
        else:
            # NOTE(review): with an explicit model_name the same model is
            # used both ways, which is only correct if it translates in
            # both directions -- confirm against callers.
            back_model = model_name

        try:
            from transformers import pipeline
            translator1 = pipeline("translation", model=model_name, tokenizer=model_name)
            translator2 = pipeline("translation", model=back_model, tokenizer=back_model)

            def back_translate(text):
                # Best-effort: any per-text failure returns the original text.
                try:
                    trans = translator1(text)[0]['translation_text']
                    back = translator2(trans)[0]['translation_text']
                    return back
                except Exception:
                    return text

            # Back-translation returns here directly; the generic loop
            # below is not reached on the success path.
            augmented = [back_translate(t) for t in texts for _ in range(num_aug)]
            labels_aug = [l for l in labels for _ in range(num_aug)]
            return augmented, labels_aug
        except Exception as e:
            # Pipeline construction failed (e.g. no model download):
            # fall through to contextual substitution instead.
            print(f"Back-translation failed: {e}. Falling back to synonym augmentation.")
            aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-cased', aug_p=aug_p)
    elif augmentation_type == "llm":
        raise NotImplementedError("LLM-controlled augmentation requires external API (e.g., OpenAI, YandexGPT)")
    else:
        raise ValueError("Unknown augmentation_type")

    # Generic path: apply the selected augmenter num_aug times per text.
    for text, label in zip(texts, labels):
        for _ in range(num_aug):
            try:
                aug_text = aug.augment(text)
                # Newer nlpaug versions return a list even for one input.
                if isinstance(aug_text, list):
                    aug_text = aug_text[0]
                augmented_texts.append(aug_text)
                augmented_labels.append(label)
            except Exception as e:
                # Keep dataset size consistent: fall back to the original text.
                augmented_texts.append(text)
                augmented_labels.append(label)

    return augmented_texts, augmented_labels
161
+
162
+
163
def balance_text_dataset(
    texts: List[str],
    labels: List[Any],
    strategy: str = "augmentation",
    minority_classes: Optional[List[Any]] = None,
    augmentation_type: str = "synonym",
    sampling_method: str = "smote",
    lang: str = "ru",
    embedding_func: Optional[Callable] = None,
    class_weights: bool = False,
    random_state: int = 42
) -> Union[
    Tuple[List[str], List[Any]],  # for augmentation
    Tuple[np.ndarray, np.ndarray, Optional[Dict]]  # for sampling + weights
]:
    """Balance a labelled text dataset by augmentation, resampling, or both.

    Strategies:
        'augmentation': augment minority-class texts and append them.
        'sampling': embed texts with embedding_func, then resample vectors.
        'both': augment first, then (if embedding_func given) resample.

    Raises:
        ValueError: For an unknown strategy, or strategy='sampling'
            without an embedding_func.
    """
    label_counts = Counter(labels)
    # Smallest relevant class size is needed both to auto-detect minority
    # classes and to size the augmentation factor below. Previously it was
    # only computed when minority_classes was None, which raised a
    # NameError whenever callers passed minority_classes explicitly.
    if minority_classes is None:
        min_count = min(label_counts.values())
        minority_classes = [lbl for lbl, cnt in label_counts.items() if cnt == min_count]
    else:
        # Counter returns 0 for unseen labels; clamp to 1 to avoid a
        # division by zero for a minority class absent from `labels`.
        min_count = max(1, min(label_counts[lbl] for lbl in minority_classes))

    if strategy == "augmentation":
        minority_texts = [t for t, l in zip(texts, labels) if l in minority_classes]
        minority_labels = [l for l in labels if l in minority_classes]

        aug_texts, aug_labels = augment_texts(
            minority_texts, minority_labels,
            augmentation_type=augmentation_type,
            lang=lang,
            # Roughly close the gap to the majority class size.
            num_aug=max(1, int((max(label_counts.values()) / min_count)) - 1),
            random_state=random_state
        )

        balanced_texts = texts + aug_texts
        balanced_labels = labels + aug_labels
        return balanced_texts, balanced_labels

    elif strategy == "sampling":
        if embedding_func is None:
            raise ValueError("embedding_func is required for sampling strategy")
        X_embed = np.array([embedding_func(t) for t in texts])
        X_res, y_res = apply_sampling(X_embed, np.array(labels), method=sampling_method, random_state=random_state)
        weights = compute_class_weights(y_res) if class_weights else None
        return X_res, y_res, weights

    elif strategy == "both":
        aug_texts, aug_labels = balance_text_dataset(
            texts, labels, strategy="augmentation", minority_classes=minority_classes,
            augmentation_type=augmentation_type, lang=lang, random_state=random_state
        )
        if embedding_func is None:
            # No embeddings available: return the augmented text dataset as-is.
            return aug_texts, aug_labels
        X_embed = np.array([embedding_func(t) for t in aug_texts])
        X_res, y_res = apply_sampling(X_embed, np.array(aug_labels), method=sampling_method, random_state=random_state)
        weights = compute_class_weights(y_res) if class_weights else None
        return X_res, y_res, weights
    else:
        raise ValueError("strategy must be 'augmentation', 'sampling', or 'both'")
src/main.py ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import json
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ from typing import List, Dict, Any, Union
8
+ import torch
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
10
+ import shap
11
+
12
# Streamlit requires set_page_config to be the first st.* call in the app,
# which is why it precedes the project-local imports below.
st.set_page_config(
    page_title="Text Classifiers",
    layout="wide",
    initial_sidebar_state="expanded"
)
17
+
18
+ from text_preprocessing import (
19
+ preprocess_text, get_contextual_embeddings, TextVectorizer
20
+ )
21
+ from classical_classifiers import (
22
+ get_logistic_regression, get_svm_linear, get_random_forest,
23
+ get_gradient_boosting, get_voting_classifier
24
+ )
25
+ from neural_classifiers import get_transformer_classifier
26
+ from model_evaluation import evaluate_model
27
+ from model_interpretation import (
28
+ get_linear_feature_importance,
29
+ analyze_errors,
30
+ get_transformer_attention,
31
+ visualize_attention_weights,
32
+ get_token_importance_captum,
33
+ plot_token_importance
34
+ )
35
+
36
+ import warnings
37
+
38
+ warnings.filterwarnings("ignore")
39
+
40
# Initialise every st.session_state key the app reads, so widget-triggered
# reruns always see consistent defaults.
_SESSION_DEFAULTS = {
    'models': {},
    'results': {},
    'dataset': None,
    'task_type': None,
    'preprocessed': None,
    'X': None,
    'y': None,
    'feature_names': None,
    'vectorizer': None,
    'vectorizer_type': None,
    'X_test': None,
    'y_test': None,
    'test_texts': None,
    'label_encoder': None,
    'rubert_model': None,
    'rubert_tokenizer': None,
    'rubert_trained': False,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
74
+
75
st.sidebar.title("Setup")

st.sidebar.subheader("1. Upload Dataset (JSONL)")
uploaded_file = st.sidebar.file_uploader("Upload .jsonl file", type=["jsonl"])

if uploaded_file:
    try:
        # Parse the upload as JSON Lines: one JSON object per non-blank line.
        raw_data = []
        lines = uploaded_file.getvalue().decode("utf-8").splitlines()
        for line in lines:
            if line.strip():
                raw_data.append(json.loads(line))
        st.session_state.dataset = raw_data

        # Infer the task type from whichever label field the FIRST record
        # carries ('sentiment' -> binary, 'category' -> multiclass,
        # 'tags' -> multilabel).
        first = raw_data[0]
        if 'sentiment' in first:
            st.session_state.task_type = "binary"
            # NOTE(review): these `labels` lists are built but never used
            # afterwards -- labels are re-extracted from the dataset later.
            labels = [item['sentiment'] for item in raw_data]
        elif 'category' in first:
            st.session_state.task_type = "multiclass"
            labels = [item['category'] for item in raw_data]
        elif 'tags' in first:
            st.session_state.task_type = "multilabel"
            labels = [item['tags'] for item in raw_data]
        else:
            st.sidebar.error("No label field found")
            st.session_state.task_type = None
            st.session_state.dataset = None

        if st.session_state.task_type:
            st.sidebar.success(f"Loaded {len(raw_data)} samples. Task: {st.session_state.task_type}")
            # Fixed id<->label maps for the two single-label tasks;
            # multilabel has no such mapping.
            if st.session_state.task_type == "binary":
                id2label = {0: "Negative", 1: "Positive"}
                label2id = {"Negative": 0, "Positive": 1}
            elif st.session_state.task_type == "multiclass":
                id2label = {0: "Политика", 1: "Экономика", 2: "Спорт", 3: "Культура"}
                label2id = {"Политика": 0, "Экономика": 1, "Спорт": 2, "Культура": 3}
            else:
                id2label = None
                label2id = None

            st.session_state.id2label = id2label
            st.session_state.label2id = label2id
    except Exception as e:
        # Any parse/shape failure invalidates the whole upload.
        st.sidebar.error(f"Failed to parse JSONL: {e}")
        st.session_state.dataset = None
121
+
122
if st.session_state.dataset is not None:
    st.sidebar.subheader("2. Preprocess Text")
    lang = st.sidebar.selectbox("Language", ["ru", "en"], index=0)
    # Honour the user's language choice; previously 'ru' was hard-coded in
    # both the session write and the preprocess_text call, which made the
    # Language selectbox a no-op.
    st.session_state.preprocess_lang = lang
    if st.sidebar.button("Run Preprocessing"):
        with st.spinner("Preprocessing..."):
            texts = [item['text'] for item in st.session_state.dataset]
            preprocessed = [preprocess_text(text, lang=lang, remove_stopwords=False) for text in texts]
            st.session_state.preprocessed = preprocessed
            st.sidebar.success("Preprocessing done!")
132
+
133
if st.session_state.preprocessed is not None:
    st.sidebar.subheader("3. Vectorization (Classical)")
    vectorizer_type = st.sidebar.selectbox("Method", ["TF-IDF", "RuBERT Embeddings"])
    if st.sidebar.button("Vectorize"):
        with st.spinner("Vectorizing..."):
            if vectorizer_type == "TF-IDF":
                vectorizer = TextVectorizer()
                # Preprocessing may have produced token lists; TF-IDF needs
                # whitespace-joined strings.
                if not isinstance(st.session_state.preprocessed[0], str):
                    st.session_state.preprocessed = [
                        ' '.join(text) for text in st.session_state.preprocessed
                    ]
                st.sidebar.write("Using max_features=5000")
                X = vectorizer.tfidf(st.session_state.preprocessed, max_features=5000)
                st.sidebar.write(f"X shape: {X.shape}")
                st.session_state.vectorizer = vectorizer
                st.session_state.feature_names = vectorizer.tfidf_vectorizer.get_feature_names_out()
            else:
                # Contextual embeddings, one text at a time.
                # NOTE(review): calling get_contextual_embeddings per text
                # presumably reloads/re-runs the model each call -- batching
                # the whole list would likely be much faster; confirm.
                X = []
                for text in st.session_state.preprocessed:
                    emb = get_contextual_embeddings([text], model_name="DeepPavlov/rubert-base-cased")
                    X.append(emb[0])
                X = np.array(X)
                st.session_state.vectorizer = None
                st.session_state.feature_names = None
            st.session_state.X = X
            st.session_state.vectorizer_type = vectorizer_type

            # Extract labels matching the detected task type.
            if st.session_state.task_type == "binary":
                y = np.array([item['sentiment'] for item in st.session_state.dataset])
            elif st.session_state.task_type == "multiclass":
                y = np.array([item['category'] for item in st.session_state.dataset])
            else:
                # Multilabel: keep as list of tag lists (ragged).
                y = [item['tags'] for item in st.session_state.dataset]
            st.session_state.y = y
            st.sidebar.success("Vectorization complete!")
168
+
169
if st.session_state.X is not None:
    st.sidebar.subheader("4. Train Classical Models")
    model_options = ["Logistic Regression", "SVM", "Random Forest", "XGBoost", "Voting"]
    selected_models = st.sidebar.multiselect("Models", model_options)
    if st.sidebar.button("Train Classical Models"):
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import LabelEncoder

        X = st.session_state.X
        y = st.session_state.y

        # Build a label vector usable for a stratified split. For
        # multilabel data the tag-count is used only as a split key.
        if st.session_state.task_type == "multiclass":
            le = LabelEncoder()
            y_encoded = le.fit_transform(y)
            st.session_state.label_encoder = le
            y_for_split = y_encoded
        else:
            y_for_split = y if st.session_state.task_type == "binary" else np.array([len(tags) for tags in y])

        if st.session_state.task_type == "multilabel":
            # Plain 80/20 ordered split; stratification is undefined for
            # ragged multilabel targets.
            split_idx = int(0.8 * len(X))
            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]
            test_texts = [item['text'] for item in st.session_state.dataset[split_idx:]]
        else:
            # Track indices so the raw test texts can be recovered for
            # later error analysis.
            indices = np.arange(len(X))
            X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
                X, y_for_split, indices, test_size=0.2,
                stratify=y_for_split if st.session_state.task_type != "multilabel" else None,
                random_state=42
            )
            test_texts = [st.session_state.dataset[i]['text'] for i in idx_test]
            if st.session_state.task_type == "multiclass":
                # Train on the original string labels.
                # NOTE(review): XGBoost generally requires numeric class
                # labels -- confirm the XGBoost branch works for multiclass.
                y_train = le.inverse_transform(y_train)
                y_test = le.inverse_transform(y_test)

        st.session_state.X_test = X_test
        st.session_state.y_test = y_test
        st.session_state.test_texts = test_texts

        # Train each selected model independently; one failure must not
        # abort the rest.
        for name in selected_models:
            try:
                with st.spinner(f"Training {name}..."):
                    if name == "Logistic Regression":
                        model = get_logistic_regression()
                        model.fit(X_train, y_train)
                        st.session_state.models[name] = model
                    elif name == "SVM":
                        model = get_svm_linear()
                        model.fit(X_train, y_train)
                        st.session_state.models[name] = model
                    elif name == "Random Forest":
                        model = get_random_forest()
                        model.fit(X_train, y_train)
                        st.session_state.models[name] = model
                    elif name == "XGBoost":
                        model = get_gradient_boosting("xgb", n_estimators=100)
                        model.fit(X_train, y_train)
                        st.session_state.models[name] = model
                    elif name == "Voting":
                        model = get_voting_classifier()
                        model.fit(X_train, y_train)
                        st.session_state.models[name] = model

                    # Metrics are only well-defined for single-label tasks here.
                    if st.session_state.task_type != "multilabel":
                        metrics = evaluate_model(model, X_test, y_test)
                        st.session_state.results[name] = metrics
            except Exception as e:
                st.sidebar.error(f"Failed to train {name}: {e}")
                continue
        st.sidebar.success("Classical models trained!")
240
+
241
if st.session_state.dataset is not None and st.session_state.task_type in ["binary", "multiclass"]:
    st.sidebar.subheader("5. Train RuBERT (Transformer)")
    if st.sidebar.button("Train RuBERT"):
        with st.spinner("Loading RuBERT..."):
            try:
                from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

                # NOTE(review): despite the button label, this only LOADS a
                # pretrained encoder with a freshly initialised
                # classification head -- no fine-tuning happens here.
                num_labels = 2 if st.session_state.task_type == "binary" else 4
                model_name = "DeepPavlov/rubert-base-cased"

                # Bake the human-readable label maps into the model config
                # so pipeline() outputs named labels instead of LABEL_n.
                config = AutoConfig.from_pretrained(
                    model_name,
                    num_labels=num_labels,
                    id2label=st.session_state.id2label,
                    label2id=st.session_state.label2id
                )

                tokenizer = AutoTokenizer.from_pretrained(model_name)
                model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

                st.session_state.rubert_model = model
                st.session_state.rubert_tokenizer = tokenizer
                st.session_state.rubert_trained = True
                st.sidebar.success("RuBERT loaded with correct labels!")
            except Exception as e:
                st.sidebar.error(f"RuBERT loading failed: {e}")
                st.exception(e)
268
+
269
st.title("Text Classifiers")

# Main content area: one tab per workflow stage.
tab1, tab2, tab3, tab4 = st.tabs([
    "Classify",
    "Interpret",
    "Compare",
    "Error Analysis"
])
277
+
278
with tab1:
    st.subheader("Classify New Text")
    input_text = st.text_area("Enter text", "Сегодня прошёл важный матч по хоккею.")

    if st.button("Classify"):
        # Side-by-side: classical model predictions vs RuBERT.
        cols = st.columns(2)
        with cols[0]:
            st.markdown("### Classical Models")
            if not st.session_state.models:
                st.info("No classical models trained")
            else:
                # Re-apply the same preprocessing + vectorization pipeline
                # used during training so features line up.
                tokens = preprocess_text(input_text, lang='ru', remove_stopwords=False)
                preprocessed = " ".join(tokens)
                if st.session_state.vectorizer_type == "TF-IDF":
                    X_input = st.session_state.vectorizer.tfidf_vectorizer.transform([preprocessed]).toarray()
                else:
                    X_input = get_contextual_embeddings([preprocessed], model_name="DeepPavlov/rubert-base-cased")

                for name, model in st.session_state.models.items():
                    pred = model.predict(X_input)[0]
                    st.write(f"**{name}**: {pred}")
                    # Hard-voting ensembles have no predict_proba.
                    if hasattr(model, "predict_proba"):
                        proba = model.predict_proba(X_input)[0]
                        st.write(f"Probabilities: {dict(zip(model.classes_, proba))}")

        with cols[1]:
            st.markdown("### RuBERT")
            if not st.session_state.rubert_trained:
                st.info("Train RuBERT in sidebar")
            else:
                try:
                    from transformers import pipeline

                    # device=-1 forces CPU inference.
                    pipe = pipeline(
                        "text-classification",
                        model=st.session_state.rubert_model,
                        tokenizer=st.session_state.rubert_tokenizer,
                        device=-1
                    )
                    result = pipe(input_text)
                    label = result[0]['label']
                    confidence = result[0]['score']

                    # Map generic LABEL_n outputs back to human-readable
                    # names when the config lacks id2label entries.
                    if label.startswith("LABEL_") and st.session_state.id2label:
                        label_id = int(label.replace("LABEL_", ""))
                        readable_label = st.session_state.id2label.get(label_id, label)
                    else:
                        readable_label = label

                    st.write(f"**Prediction**: {readable_label}")
                    st.write(f"**Confidence**: {confidence:.3f}")
                except Exception as e:
                    st.error(f"RuBERT inference failed: {e}")
331
+
332
+ with tab2:
333
+ subtab1, subtab2, subtab3 = st.tabs(["SHAP / LIME", "Attention Map", "Captum Heatmap"])
334
+
335
+ with subtab1:
336
+ st.subheader("SHAP: Local Explanation for One Text")
337
+ if not st.session_state.models:
338
+ st.info("Train a classical model first")
339
+ else:
340
+ model_name = st.selectbox("Model", list(st.session_state.models.keys()), key="shap_model")
341
+ text_for_explain = st.text_area("Text to explain", "Прекрасная новость о росте экономики!", key="shap_text")
342
+ top_k = st.slider("Top features to show", 5, 30, 15)
343
+
344
+ if st.button("Explain with SHAP"):
345
+ try:
346
+ import shap
347
+
348
+ model = st.session_state.models[model_name]
349
+ tokens = preprocess_text(text_for_explain, lang='ru', remove_stopwords=False)
350
+ preprocessed = " ".join(tokens)
351
+
352
+ if st.session_state.vectorizer_type == "TF-IDF":
353
+ X_input = st.session_state.vectorizer.tfidf_vectorizer.transform([preprocessed]).toarray()
354
+ feature_names = st.session_state.feature_names
355
+ else:
356
+ X_input = get_contextual_embeddings([preprocessed], model_name="DeepPavlov/rubert-base-cased")
357
+ feature_names = [f"emb_{i}" for i in range(X_input.shape[1])]
358
+
359
+ background = st.session_state.X[:100]
360
+ # st.write(f"DEBUG: st.session_state.X shape = {st.session_state.X.shape}")
361
+ # st.write(f"DEBUG: X_input shape = {X_input.shape}")
362
+ # st.write(f'DEBUG: background shape = {background.shape}')
363
+ if "tree" in str(type(model)).lower():
364
+ explainer = shap.TreeExplainer(model)
365
+ shap_values = explainer.shap_values(X_input)
366
+ else:
367
+ explainer = shap.KernelExplainer(model.predict_proba, background)
368
+ shap_values = explainer.shap_values(X_input, nsamples=200)
369
+
370
+ if isinstance(shap_values, list):
371
+ probs = model.predict_proba(X_input)[0]
372
+ target_class = int(np.argmax(probs))
373
+ single_shap = shap_values[target_class][0]
374
+ expected_val = explainer.expected_value[target_class]
375
+ else:
376
+ sv = shap_values
377
+ if sv.ndim == 1:
378
+ single_shap = sv
379
+ expected_val = explainer.expected_value
380
+ elif sv.ndim == 2:
381
+ if sv.shape[0] == 1:
382
+ single_shap = sv[0]
383
+ expected_val = explainer.expected_value
384
+ elif sv.shape[1] == X_input.shape[1]:
385
+ probs = model.predict_proba(X_input)[0]
386
+ target_class = int(np.argmax(probs))
387
+ single_shap = sv[:, target_class]
388
+ expected_val = explainer.expected_value[target_class] if isinstance(
389
+ explainer.expected_value, (list, np.ndarray)) else explainer.expected_value
390
+ else:
391
+ single_shap = sv[0]
392
+ expected_val = explainer.expected_value
393
+ elif sv.ndim == 3:
394
+ if sv.shape[0] != 1:
395
+ raise ValueError("SHAP explanation for more than one sample not supported")
396
+ probs = model.predict_proba(X_input)[0]
397
+ target_class = int(np.argmax(probs))
398
+ single_shap = sv[0, :, target_class]
399
+ if isinstance(explainer.expected_value, (list, np.ndarray)) and len(
400
+ explainer.expected_value) == sv.shape[2]:
401
+ expected_val = explainer.expected_value[target_class]
402
+ else:
403
+ expected_val = explainer.expected_value
404
+ else:
405
+ raise ValueError(f"Unsupported SHAP shape: {sv.shape}")
406
+
407
+ single_shap = np.array(single_shap).flatten()
408
+ if single_shap.shape[0] != X_input.shape[1]:
409
+ raise ValueError(
410
+ f"SHAP vector length {single_shap.shape[0]} != input features {X_input.shape[1]}")
411
+
412
+ if st.session_state.vectorizer_type == "TF-IDF":
413
+ text_vector = X_input[0]
414
+ nonzero_indices = np.where(text_vector != 0)[0]
415
+ if len(nonzero_indices) == 0:
416
+ st.warning("No known words from training vocabulary found in this text.")
417
+ else:
418
+ filtered_shap = single_shap[nonzero_indices]
419
+ filtered_features = text_vector[nonzero_indices]
420
+ filtered_names = [st.session_state.feature_names[i] for i in nonzero_indices]
421
+
422
+ explanation = shap.Explanation(
423
+ values=filtered_shap,
424
+ base_values=expected_val,
425
+ data=filtered_features,
426
+ feature_names=filtered_names
427
+ )
428
+
429
+ plt.figure(figsize=(10, min(8, top_k * 0.3)))
430
+ shap.plots.waterfall(explanation, max_display=top_k, show=False)
431
+ st.pyplot(plt.gcf())
432
+ plt.close()
433
+ else:
434
+ explanation = shap.Explanation(
435
+ values=single_shap,
436
+ base_values=expected_val,
437
+ data=X_input[0],
438
+ feature_names=feature_names
439
+ )
440
+ plt.figure(figsize=(10, min(8, top_k * 0.3)))
441
+ shap.plots.waterfall(explanation, max_display=top_k, show=False)
442
+ st.pyplot(plt.gcf())
443
+ plt.close()
444
+
445
+ except Exception as e:
446
+ st.error(f"SHAP error: {e}")
447
+ st.exception(e)
448
+
449
+ with subtab2:
450
+ st.subheader("Transformer Attention Map")
451
+ if not st.session_state.rubert_trained:
452
+ st.info("Train RuBERT first")
453
+ else:
454
+ text_att = st.text_area("Text for attention", "Матч завершился победой ЦСКА", key="att_text")
455
+ layer = st.slider("Layer", 0, 11, 6)
456
+ head = st.slider("Head", 0, 11, 0)
457
+ if st.button("Visualize Attention"):
458
+ try:
459
+ tokens, attn = get_transformer_attention(
460
+ st.session_state.rubert_model,
461
+ st.session_state.rubert_tokenizer,
462
+ text_att,
463
+ device="cpu"
464
+ )
465
+ weights = attn[layer, head, :len(tokens), :len(tokens)]
466
+
467
+ fig, ax = plt.subplots(figsize=(10, 4))
468
+ sns.heatmap(
469
+ weights,
470
+ xticklabels=tokens,
471
+ yticklabels=tokens,
472
+ cmap="viridis",
473
+ ax=ax
474
+ )
475
+ plt.xticks(rotation=45, ha="right")
476
+ plt.yticks(rotation=0)
477
+ plt.title(f"Attention: Layer {layer}, Head {head}")
478
+ st.pyplot(fig)
479
+ plt.close(fig)
480
+ except Exception as e:
481
+ st.error(f"Attention failed: {e}")
482
+ st.exception(e)
483
+
484
+ with subtab3:
485
+ st.subheader("Token Importance (Captum)")
486
+ if not st.session_state.rubert_trained:
487
+ st.info("Train RuBERT first")
488
+ else:
489
+ text_captum = st.text_area("Text for Captum", "Это очень плохая новость для политики", key="captum_text")
490
+ method = "IntegratedGradients"
491
+ if st.button("Compute Token Importance"):
492
+ try:
493
+ tokens, importance = get_token_importance_captum(
494
+ st.session_state.rubert_model,
495
+ st.session_state.rubert_tokenizer,
496
+ text_captum,
497
+ device="cpu"
498
+ )
499
+ valid = [(t, imp) for t, imp in zip(tokens, importance) if t not in ["[CLS]", "[SEP]", "[PAD]"]]
500
+ if valid:
501
+ tokens_clean, imp_clean = zip(*valid)
502
+ indices = np.argsort(np.abs(imp_clean))[-15:][::-1]
503
+ tokens_top = [tokens_clean[i] for i in indices]
504
+ imp_top = [imp_clean[i] for i in indices]
505
+
506
+ fig, ax = plt.subplots(figsize=(8, 6))
507
+ colors = ["red" if x < 0 else "green" for x in imp_top]
508
+ ax.barh(range(len(imp_top)), imp_top, color=colors)
509
+ ax.set_yticks(range(len(imp_top)))
510
+ ax.set_yticklabels(tokens_top)
511
+ ax.invert_yaxis()
512
+ ax.set_xlabel("Attribution Score")
513
+ ax.set_title("Token Importance")
514
+ st.pyplot(fig)
515
+ plt.close(fig)
516
+ else:
517
+ st.warning("No valid tokens")
518
+ except Exception as e:
519
+ st.error(f"Captum failed: {e}")
520
+ st.exception(e)
521
+
522
+ with tab3:
523
+ st.subheader("Model Comparison")
524
+ if st.session_state.results:
525
+ df = pd.DataFrame(st.session_state.results).T
526
+ st.dataframe(df)
527
+ else:
528
+ st.info("Train models to see metrics")
529
+
530
+ with tab4:
531
+ st.subheader("Error Analysis")
532
+ if st.session_state.X_test is None:
533
+ st.info("Train models first")
534
+ else:
535
+ model_name = st.selectbox("Model for error analysis", list(st.session_state.models.keys()), key="err_model")
536
+ if st.button("Analyze Errors"):
537
+ model = st.session_state.models[model_name]
538
+ y_pred = model.predict(st.session_state.X_test)
539
+ errors = analyze_errors(
540
+ st.session_state.y_test,
541
+ y_pred,
542
+ st.session_state.test_texts
543
+ )
544
+ st.dataframe(errors[['text', 'true_label', 'pred_label']].head(20))
src/model_evaluation.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, Union, Callable, Optional, Tuple, List
2
+ import numpy as np
3
+ import pandas as pd
4
+ from collections import defaultdict
5
+ import torch
6
+
7
+ from sklearn.model_selection import (
8
+ StratifiedKFold, GroupKFold, TimeSeriesSplit,
9
+ GridSearchCV, RandomizedSearchCV
10
+ )
11
+ from sklearn.metrics import (
12
+ accuracy_score, precision_score, recall_score, f1_score,
13
+ roc_auc_score, average_precision_score, log_loss,
14
+ confusion_matrix, classification_report
15
+ )
16
+ from sklearn.base import BaseEstimator
17
+ import warnings
18
+
19
+ warnings.filterwarnings("ignore")
20
+
21
+ OPTUNA_AVAILABLE = False
22
+ HYPEROPT_AVAILABLE = False
23
+ try:
24
+ import optuna
25
+ from optuna.samplers import TPESampler
26
+
27
+ OPTUNA_AVAILABLE = True
28
+ except ImportError:
29
+ pass
30
+
31
+ try:
32
+ from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
33
+
34
+ HYPEROPT_AVAILABLE = True
35
+ except ImportError:
36
+ pass
37
+
38
+ WANDB_AVAILABLE = False
39
+ try:
40
+ import wandb
41
+
42
+ WANDB_AVAILABLE = True
43
+ except ImportError:
44
+ pass
45
+
46
+
47
def get_cv_splitter(
    cv_type: str = "stratified",
    n_splits: int = 5,
    groups: Optional[np.ndarray] = None,
    random_state: int = 42
):
    """Build a scikit-learn cross-validation splitter.

    Args:
        cv_type: 'stratified' (shuffled StratifiedKFold), 'group'
            (GroupKFold, requires `groups`), or 'time' (TimeSeriesSplit).
        n_splits: number of folds.
        groups: group labels, only consulted for cv_type='group'.
        random_state: shuffle seed for the stratified splitter.

    Raises:
        ValueError: unknown cv_type, or 'group' without `groups`.
    """
    if cv_type == "time":
        return TimeSeriesSplit(n_splits=n_splits)
    if cv_type == "group":
        if groups is None:
            raise ValueError("groups must be provided for GroupKFold")
        return GroupKFold(n_splits=n_splits)
    if cv_type == "stratified":
        return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    raise ValueError("cv_type must be 'stratified', 'group', or 'time'")
63
+
64
+
65
def grid_search_cv(
    model: BaseEstimator,
    X: np.ndarray,
    y: np.ndarray,
    param_grid: Dict[str, List],
    cv_type: str = "stratified",
    n_splits: int = 5,
    scoring: str = "f1_macro",
    groups: Optional[np.ndarray] = None,
    verbose: int = 1
) -> GridSearchCV:
    """Exhaustive hyperparameter search with cross-validation.

    Returns the fitted GridSearchCV object; inspect best_estimator_,
    best_params_ and cv_results_ on it. Uses all cores (n_jobs=-1).
    """
    splitter = get_cv_splitter(cv_type, n_splits, groups)
    searcher = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=splitter,
        scoring=scoring,
        verbose=verbose,
        n_jobs=-1,
    )
    searcher.fit(X, y)
    return searcher
82
+
83
+
84
def random_search_cv(
    model: BaseEstimator,
    X: np.ndarray,
    y: np.ndarray,
    param_distributions: Dict[str, Any],
    n_iter: int = 20,
    cv_type: str = "stratified",
    n_splits: int = 5,
    scoring: str = "f1_macro",
    groups: Optional[np.ndarray] = None,
    verbose: int = 1
) -> RandomizedSearchCV:
    """Randomized hyperparameter search with cross-validation.

    Samples n_iter parameter settings from param_distributions (fixed
    seed 42 for reproducibility) and returns the fitted
    RandomizedSearchCV object. Uses all cores (n_jobs=-1).
    """
    splitter = get_cv_splitter(cv_type, n_splits, groups)
    searcher = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=splitter,
        scoring=scoring,
        verbose=verbose,
        n_jobs=-1,
        random_state=42,
    )
    searcher.fit(X, y)
    return searcher
103
+
104
+
105
def _optuna_objective(
    trial,
    model_fn: Callable,
    X: np.ndarray,
    y: np.ndarray,
    cv,
    scoring: str = "f1_macro"
) -> float:
    """Cross-validated objective for an Optuna study.

    The search space is chosen by inspecting model_fn.__name__: factories
    whose name contains "logistic" or "random_forest" get a built-in
    space; any other factory is called as model_fn(trial) and is expected
    to sample its own hyperparameters from the trial. Returns the mean
    fold score (higher is better).
    """
    # Heuristic dispatch on the factory's *name*, not its return type.
    if "logistic" in model_fn.__name__.lower():
        C = trial.suggest_float("C", 1e-4, 1e2, log=True)
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        # lbfgs does not support l1; liblinear does.
        solver = "liblinear" if penalty == "l1" else "lbfgs"
        model = model_fn(C=C, penalty=penalty, solver=solver)
    elif "random_forest" in model_fn.__name__.lower():
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        model = model_fn(n_estimators=n_estimators, max_depth=max_depth)
    else:
        # Fallback: the factory itself defines the search space.
        model = model_fn(trial)

    scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        if scoring == "f1_macro":
            score = f1_score(y_val, y_pred, average="macro")
        elif scoring == "roc_auc":
            # NOTE(review): [:, 1] assumes binary classification — confirm
            # callers never request roc_auc with more than two classes.
            y_proba = model.predict_proba(X_val)[:, 1]
            score = roc_auc_score(y_val, y_proba)
        else:
            raise ValueError(f"Scoring {scoring} not implemented in custom Optuna loop")
        scores.append(score)
    return np.mean(scores)
140
+
141
+
142
def optuna_tuning(
    model_fn: Callable,
    X: np.ndarray,
    y: np.ndarray,
    n_trials: int = 50,
    cv_type: str = "stratified",
    n_splits: int = 5,
    scoring: str = "f1_macro",
    groups: Optional[np.ndarray] = None,
    direction: str = "maximize"
) -> "optuna.Study":
    """Run an Optuna TPE study over _optuna_objective.

    Returns the completed Study (best_params / best_value live on it).

    Fix: the return annotation was the bare name `optuna.Study`, which is
    evaluated at function-definition time and raised NameError on module
    import whenever Optuna was not installed — defeating the module's
    OPTUNA_AVAILABLE guard. The annotation is now a string (deferred), and
    a missing Optuna raises an explicit ImportError at call time instead.
    """
    if not OPTUNA_AVAILABLE:
        raise ImportError("Install Optuna: pip install optuna")
    cv = get_cv_splitter(cv_type, n_splits, groups)
    # Fixed sampler seed keeps studies reproducible across runs.
    study = optuna.create_study(direction=direction, sampler=TPESampler(seed=42))
    study.optimize(
        lambda trial: _optuna_objective(trial, model_fn, X, y, cv, scoring),
        n_trials=n_trials
    )
    return study
160
+
161
+
162
def hyperopt_tuning(
    model_fn: Callable,
    X: np.ndarray,
    y: np.ndarray,
    space: Dict,
    max_evals: int = 50,
    cv_type: str = "stratified",
    n_splits: int = 5,
    scoring: str = "f1_macro",
    groups: Optional[np.ndarray] = None
):
    """Hyperopt TPE search over `space`, maximising the mean CV score.

    Returns (best_params, trials).

    Fix: the objective previously appended the NEGATED fold score and then
    negated the mean again (loss = -mean(-score) = +mean(score)). Since
    hyperopt minimises `loss`, it was actively selecting the WORST
    hyperparameters. Raw scores are now accumulated and negated exactly
    once. Also raises a clear ImportError when hyperopt is missing instead
    of an opaque NameError.
    """
    if not HYPEROPT_AVAILABLE:
        raise ImportError("Install hyperopt: pip install hyperopt")
    cv = get_cv_splitter(cv_type, n_splits, groups)

    def objective(params):
        # Fresh model per parameter setting; refit on every fold.
        model = model_fn(**params)
        scores = []
        for train_idx, val_idx in cv.split(X, y):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            if scoring == "f1_macro":
                score = f1_score(y_val, y_pred, average="macro")
            elif scoring == "roc_auc":
                y_proba = model.predict_proba(X_val)[:, 1]
                score = roc_auc_score(y_val, y_proba)
            else:
                score = -1
            scores.append(score)
        # hyperopt minimises `loss`; negate once so higher scores win.
        return {'loss': -np.mean(scores), 'status': STATUS_OK}

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    return best, trials
196
+
197
+
198
def compute_classification_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_proba: Optional[np.ndarray] = None,
    average: str = "macro"
) -> Dict[str, float]:
    """Standard classification metrics, plus probability metrics when available.

    Binary problems (two unique labels in y_true) use the positive-class
    column y_proba[:, 1]; multiclass uses one-vs-rest ROC AUC.

    Fix: on the multiclass fallback path the old code left "log_loss"
    missing from the dict whenever roc_auc_score raised first, so the
    returned key set varied with the input. All three probability keys are
    now always present (NaN when not computable), and metrics that did
    succeed before the failure are kept rather than overwritten with NaN.
    """
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average=average, zero_division=0),
        "recall": recall_score(y_true, y_pred, average=average, zero_division=0),
        "f1": f1_score(y_true, y_pred, average=average, zero_division=0),
    }

    if y_proba is not None:
        if len(np.unique(y_true)) == 2:
            metrics["roc_auc"] = roc_auc_score(y_true, y_proba[:, 1])
            metrics["pr_auc"] = average_precision_score(y_true, y_proba[:, 1])
            metrics["log_loss"] = log_loss(y_true, y_proba)
        else:
            try:
                metrics["roc_auc"] = roc_auc_score(y_true, y_proba, multi_class="ovr", average=average)
                metrics["pr_auc"] = average_precision_score(y_true, y_proba, average=average)
                metrics["log_loss"] = log_loss(y_true, y_proba)
            except ValueError:
                # Keep whatever was computed; fill the rest with NaN so
                # the key set is stable across inputs.
                metrics.setdefault("roc_auc", np.nan)
                metrics.setdefault("pr_auc", np.nan)
                metrics.setdefault("log_loss", np.nan)

    return metrics
226
+
227
+
228
def evaluate_model(
    model: BaseEstimator,
    X_test: np.ndarray,
    y_test: np.ndarray,
    average: str = "macro",
    return_pred: bool = False
) -> Union[Dict[str, float], Tuple[Dict[str, float], np.ndarray, Optional[np.ndarray]]]:
    """Score a fitted classifier on a held-out split.

    Returns the metrics dict, or (metrics, y_pred, y_proba) when
    return_pred is True. y_proba is None for models that do not expose
    predict_proba.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    metrics = compute_classification_metrics(y_test, y_pred, y_proba, average=average)

    if not return_pred:
        return metrics
    return metrics, y_pred, y_proba
245
+
246
+
247
def get_early_stopping(
    monitor: str = "val_loss",
    patience: int = 5,
    mode: str = "min",
    framework: str = "keras"
):
    """Return training callbacks for Keras: [EarlyStopping, ReduceLROnPlateau].

    EarlyStopping restores the best weights; ReduceLROnPlateau halves the
    learning rate after 3 stagnant epochs (floor 1e-7). The 'pytorch'
    branch deliberately raises NotImplementedError — there is no callback
    object to hand back for a hand-written training loop.
    """
    if framework == "pytorch":
        raise NotImplementedError("PyTorch callbacks require custom training loop")
    if framework != "keras":
        raise ValueError("framework must be 'keras' or 'pytorch'")

    # Imported lazily so this module loads without TensorFlow installed.
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    return [
        EarlyStopping(monitor=monitor, patience=patience, restore_best_weights=True, mode=mode),
        ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=3, min_lr=1e-7, mode=mode),
    ]
262
+
263
+
264
def init_wandb(
    project_name: str = "text-classification",
    run_name: Optional[str] = None,
    config: Optional[Dict] = None
):
    """Start a Weights & Biases run.

    Returns the wandb module so callers can keep logging through it, or
    None when wandb is not installed (silent no-op by design).
    """
    if not WANDB_AVAILABLE:
        return None
    wandb.init(project=project_name, name=run_name, config=config)
    return wandb
273
+
274
+
275
def log_metrics_to_wandb(metrics: Dict[str, float]):
    """Send metrics to the active W&B run; no-op when wandb is missing or inactive."""
    if not WANDB_AVAILABLE:
        return
    if wandb.run:
        wandb.log(metrics)
278
+
279
+
280
def suggest_transformer_hparams(trial) -> Dict[str, Any]:
    """Sample a transformer fine-tuning configuration from an Optuna trial.

    Keys follow HuggingFace TrainingArguments naming (note the sampled
    "batch_size" is returned under "per_device_train_batch_size").
    """
    lr = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch = trial.suggest_categorical("batch_size", [8, 16, 32])
    epochs = trial.suggest_int("num_train_epochs", 2, 6)
    decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    warmup = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    return {
        "learning_rate": lr,
        "per_device_train_batch_size": batch,
        "num_train_epochs": epochs,
        "weight_decay": decay,
        "warmup_ratio": warmup,
    }
288
+
289
+
290
def evaluate_transformer_outputs(
    y_true: List[int],
    y_pred: List[int],
    y_logits: Optional[np.ndarray] = None
) -> Dict[str, float]:
    """Macro-averaged metrics for transformer predictions.

    Raw logits, when given, are softmaxed into class probabilities so the
    probability-based metrics (ROC/PR AUC, log loss) can be computed.
    """
    true_arr = np.array(y_true)
    pred_arr = np.array(y_pred)

    proba = None
    if y_logits is not None:
        proba = torch.softmax(torch.tensor(y_logits), dim=-1).numpy()

    return compute_classification_metrics(true_arr, pred_arr, proba, average="macro")
302
+
303
+
304
def confusion_matrix_df(y_true: np.ndarray, y_pred: np.ndarray, labels: Optional[List] = None) -> pd.DataFrame:
    """Confusion matrix as a DataFrame with True_*/Pred_* row/column labels.

    Fix: when labels is None, sklearn's confusion_matrix is built over the
    sorted UNION of labels appearing in y_true and y_pred, but the old
    code labelled the frame with unique(y_true) only — misaligning (or
    crashing) whenever y_pred contained a class absent from y_true. The
    label list is now resolved first and passed to confusion_matrix so the
    matrix and its labels always agree.
    """
    if labels is None:
        labels = sorted(np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)])))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return pd.DataFrame(
        cm,
        index=[f"True_{l}" for l in labels],
        columns=[f"Pred_{l}" for l in labels],
    )
src/model_interpretation.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional, Union, Callable, Tuple
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from collections import defaultdict
7
+ import torch
8
+
9
+ from sklearn.base import BaseEstimator
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.decomposition import PCA
12
+ from sklearn.manifold import TSNE
13
+ import warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
+ SHAP_AVAILABLE = False
17
+ LIME_AVAILABLE = False
18
+ CAPTUM_AVAILABLE = False
19
+ UMAP_AVAILABLE = False
20
+
21
+ try:
22
+ import shap
23
+ SHAP_AVAILABLE = True
24
+ except ImportError:
25
+ pass
26
+
27
+ try:
28
+ import lime
29
+ import lime.lime_text
30
+ LIME_AVAILABLE = True
31
+ except ImportError:
32
+ pass
33
+
34
+ try:
35
+ import captum
36
+ import captum.attr
37
+ CAPTUM_AVAILABLE = True
38
+ except ImportError:
39
+ pass
40
+
41
+ try:
42
+ import umap
43
+ UMAP_AVAILABLE = True
44
+ except ImportError:
45
+ pass
46
+
47
+
48
def get_linear_feature_importance(
    model: BaseEstimator,
    feature_names: Optional[List[str]] = None,
    class_index: int = -1
) -> pd.DataFrame:
    """Rank a linear model's features by coefficient magnitude.

    For multiclass coef_ matrices, class_index == -1 averages the
    coefficients over classes; any other value selects that class's row.
    Returns a DataFrame with columns [feature, weight], sorted by |weight|
    descending.

    Raises:
        ValueError: the model has no coef_ attribute.
    """
    if not hasattr(model, "coef_"):
        raise ValueError("Model does not have coef_ attribute")

    coef = model.coef_
    if coef.ndim == 1:
        weights = coef
    elif class_index == -1:
        weights = np.mean(coef, axis=0)
    else:
        weights = coef[class_index]

    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(len(weights))]

    frame = pd.DataFrame({"feature": feature_names, "weight": weights})
    return frame.sort_values("weight", key=abs, ascending=False).reset_index(drop=True)
71
+
72
+
73
def analyze_tfidf_class_keywords(
    tfidf_matrix: np.ndarray,
    y: np.ndarray,
    feature_names: List[str],
    top_k: int = 20
) -> Dict[Any, pd.DataFrame]:
    """Top TF-IDF terms per class, as {class: DataFrame[word, tfidf_score]}.

    Works for dense arrays AND scipy sparse matrices. Fix: the old code
    tested hasattr(tfidf_matrix, 'A1'), but scipy sparse matrices expose
    .A, not .A1 (.A1 lives on np.matrix) — so the "sparse" branch was
    never taken and np.mean over a sparse matrix returned a 2-D np.matrix,
    corrupting the argsort-based ranking. np.asarray(...).ravel()
    normalises every case to a flat 1-D vector.
    """
    results: Dict[Any, pd.DataFrame] = {}
    for cls in np.unique(y):
        mask = (y == cls)
        # .mean(axis=0) exists on both ndarray and scipy sparse; flatten
        # whatever comes back (1-D array, (1, n) matrix, ...) to 1-D.
        avg_tfidf = np.asarray(tfidf_matrix[mask].mean(axis=0)).ravel()
        top_indices = np.argsort(avg_tfidf)[::-1][:top_k]
        results[cls] = pd.DataFrame({
            "word": [feature_names[i] for i in top_indices],
            "tfidf_score": [avg_tfidf[i] for i in top_indices],
        })
    return results
91
+
92
+
93
def explain_with_shap(
    model: BaseEstimator,
    X_train: np.ndarray,
    X_test: np.ndarray,
    feature_names: Optional[List[str]] = None,
    plot_type: str = "bar",
    max_display: int = 20
):
    """Render a SHAP summary plot for (up to) the first 100 test rows.

    Tree models get the fast TreeExplainer; anything else falls back to
    KernelExplainer with a 100-row background sample from X_train.

    Fixes: raises a clear ImportError when shap is missing (the module
    deliberately tolerates it at import time via SHAP_AVAILABLE), and the
    previous if/else around summary_plot had byte-identical branches —
    summary_plot accepts both a single array and a per-class list, so the
    duplication is collapsed into one call.
    """
    if not SHAP_AVAILABLE:
        raise ImportError("Install SHAP: pip install shap")

    if "tree" in str(type(model)).lower():
        explainer = shap.TreeExplainer(model)
    else:
        explainer = shap.KernelExplainer(model.predict_proba, X_train[:100])

    sample = X_test[:100]
    shap_values = explainer.shap_values(sample)

    if feature_names is None:
        feature_names = [f"feat_{i}" for i in range(X_test.shape[1])]

    plt.figure(figsize=(10, 6))
    shap.summary_plot(
        shap_values, sample,
        feature_names=feature_names,
        plot_type=plot_type,
        max_display=max_display,
        show=False,
    )
    plt.tight_layout()
    plt.show()
118
+
119
+
120
def explain_text_with_lime(
    model: Any,
    text: str,
    tokenizer: Callable,
    class_names: List[str],
    num_features: int = 10,
    num_samples: int = 5000
):
    """Show a LIME text explanation in a notebook.

    `tokenizer` stays in the signature for backward compatibility but is
    not needed: LIME perturbs raw strings and the model's own vectorizer
    handles tokenisation. Fix: the old predict_fn tokenised every one of
    the `num_samples` perturbed texts and discarded the result — pure
    wasted work — and a missing lime package surfaced as a NameError
    instead of a clear ImportError.
    """
    if not LIME_AVAILABLE:
        raise ImportError("Install LIME: pip install lime")

    def predict_fn(texts):
        if not hasattr(model, "vectorizer"):
            raise NotImplementedError("Custom predict_fn needed for your pipeline")
        X = model.vectorizer.transform(texts)
        return model.predict_proba(X.toarray())

    explainer = lime.lime_text.LimeTextExplainer(class_names=class_names)
    exp = explainer.explain_instance(text, predict_fn, num_features=num_features, num_samples=num_samples)
    exp.show_in_notebook()
139
+
140
+
141
def visualize_attention_weights(
    tokens: List[str],
    attention_weights: np.ndarray,
    layer: int = 0,
    head: int = 0,
    figsize: Tuple[int, int] = (10, 2)
):
    """Heatmap of one attention head, cropped to the given token span.

    Raises:
        ValueError: attention_weights is not (layers, heads, seq, seq).
    """
    if attention_weights.ndim != 4:
        raise ValueError("attention_weights must be 4D: (layers, heads, seq, seq)")

    n = len(tokens)
    grid = attention_weights[layer, head, :n, :n]

    plt.figure(figsize=figsize)
    sns.heatmap(
        grid,
        xticklabels=tokens,
        yticklabels=tokens,
        cmap="viridis",
        cbar=True
    )
    plt.title(f"Attention Layer {layer}, Head {head}")
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
166
+
167
+
168
def get_transformer_attention(
    model: 'torch.nn.Module',
    tokenizer: 'transformers.PreTrainedTokenizer',
    text: str,
    device: str = "cpu"
) -> Tuple[List[str], np.ndarray]:
    """Return (tokens, attention) for one text from a HF transformer.

    `attention` has shape (layers, heads, seq, seq) for the single input
    sequence. Fix: the old code raised ImportError when Captum was
    missing, but nothing here uses Captum — only torch and the supplied
    model/tokenizer are required, so the bogus guard is removed.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    model = model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids, output_attentions=True)
        attentions = outputs.attentions

    # Stack the per-layer tuple -> (layers, 1, heads, seq, seq), then drop
    # the batch axis (batch size is always 1 here).
    attn = torch.stack(attentions, dim=0).squeeze(1).cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    return tokens, attn
189
+
190
+
191
def analyze_errors(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    texts: List[str],
    labels: Optional[List[Any]] = None
) -> pd.DataFrame:
    """Collect misclassified samples into a DataFrame.

    Columns: index, text, true_label, pred_label. Returns an empty
    DataFrame (no columns) when every prediction is correct.
    NOTE(review): `labels` is accepted but currently unused — kept for
    interface compatibility; confirm whether callers expect label mapping.
    """
    rows = [
        {"index": i, "text": txt, "true_label": t, "pred_label": p}
        for i, (t, p, txt) in enumerate(zip(y_true, y_pred, texts))
        if t != p
    ]
    return pd.DataFrame(rows)
207
+
208
+
209
def compare_model_errors(
    models: Dict[str, BaseEstimator],
    X_test: np.ndarray,
    y_test: np.ndarray,
    texts: List[str]
) -> Dict[str, pd.DataFrame]:
    """Run analyze_errors for every fitted model; results keyed by model name."""
    return {
        name: analyze_errors(y_test, model.predict(X_test), texts)
        for name, model in models.items()
    }
221
+
222
+
223
def plot_embeddings(
    embeddings: np.ndarray,
    labels: np.ndarray,
    method: str = "umap",
    n_components: int = 2,
    figsize: Tuple[int, int] = (12, 8),
    title: str = "Embedding Projection"
):
    """Scatter-plot document embeddings reduced to 2-D with t-SNE or UMAP.

    Points are coloured by `labels` (tab10 colormap). Raises ImportError
    for method='umap' when umap-learn is not installed, ValueError for an
    unknown method.
    """
    if method == "umap":
        if not UMAP_AVAILABLE:
            raise ImportError("Install UMAP: pip install umap-learn")
        reducer = umap.UMAP(n_components=n_components, random_state=42, n_jobs=-1)
    elif method == "tsne":
        reducer = TSNE(n_components=n_components, random_state=42, n_jobs=-1)
    else:
        raise ValueError("method must be 'tsne' or 'umap'")

    points = reducer.fit_transform(embeddings)

    plt.figure(figsize=figsize)
    sc = plt.scatter(points[:, 0], points[:, 1], c=labels, cmap="tab10", alpha=0.7)
    plt.colorbar(sc)
    plt.title(title)
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.tight_layout()
    plt.show()
250
+
251
+
252
def get_token_importance_captum(
    model: 'torch.nn.Module',
    tokenizer: 'transformers.PreTrainedTokenizer',
    text: str,
    device: str = "cpu"
) -> Tuple[List[str], np.ndarray]:
    """Per-token attributions via Captum's Layer Integrated Gradients.

    Attributes the model's own predicted class (argmax of the logits for
    this text) and sums attributions over the embedding dimension, giving
    one signed score per input token. Returns (tokens, attributions).

    Raises:
        ImportError: if Captum is not installed.
    """
    if not CAPTUM_AVAILABLE:
        raise ImportError("Install Captum: pip install captum")

    from captum.attr import LayerIntegratedGradients
    import torch

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    model = model.to(device)
    model.eval()

    # Target the class the model itself predicts for this text.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        pred_class = torch.argmax(outputs.logits, dim=1).item()

    # attention_mask is captured from the enclosing scope so the baseline
    # forward passes see the same mask as the real input.
    def forward_func(input_ids):
        return model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Baseline: all-zero token ids, except the real [CLS]/[SEP] ids at the
    # sequence ends. NOTE(review): assumes id 0 is a neutral baseline
    # token ([PAD] in BERT vocabularies) — confirm for other tokenizers.
    baseline_ids = torch.zeros_like(input_ids).to(device)
    baseline_ids[:, 0] = tokenizer.cls_token_id
    baseline_ids[:, -1] = tokenizer.sep_token_id

    # NOTE(review): model.bert.embeddings hard-codes a BERT-style model;
    # DistilBERT/RoBERTa expose embeddings under different attribute names.
    lig = LayerIntegratedGradients(forward_func, model.bert.embeddings)

    attributions, delta = lig.attribute(
        inputs=input_ids,
        baselines=baseline_ids,
        target=pred_class,
        return_convergence_delta=True
    )

    # Collapse the embedding dimension: one signed score per token.
    attributions = attributions.sum(dim=-1).squeeze(0).cpu().detach().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    return tokens, attributions
301
+
302
+
303
def plot_token_importance(tokens: List[str], importance: np.ndarray, top_k: int = 20):
    """Horizontal bar chart of the top-k tokens by |attribution|.

    Special tokens ([CLS]/[SEP]/[PAD]) are dropped first; green bars are
    positive attributions, red negative. Silently returns when no
    non-special tokens remain.
    """
    specials = {"[CLS]", "[SEP]", "[PAD]"}
    pairs = [(tok, val) for tok, val in zip(tokens, importance) if tok not in specials]
    if not pairs:
        return

    toks, vals = zip(*pairs)
    order = np.argsort(np.abs(vals))[-top_k:][::-1]
    top_tokens = [toks[i] for i in order]
    top_vals = [vals[i] for i in order]

    plt.figure(figsize=(10, 6))
    bar_colors = ["red" if v < 0 else "green" for v in top_vals]
    plt.barh(range(len(top_vals)), top_vals, color=bar_colors)
    plt.yticks(range(len(top_vals)), top_tokens)
    plt.gca().invert_yaxis()
    plt.xlabel("Attribution Score")
    plt.title("Token Importance (Green: positive, Red: negative)")
    plt.tight_layout()
    plt.show()
src/neural_classifiers.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, Union, Tuple, Dict, Any, Literal
3
+ import numpy as np
4
+
5
+ try:
6
+ import tensorflow as tf
7
+ from tensorflow.keras import layers, models, optimizers, callbacks
8
+ from tensorflow.keras.models import Model
9
+ from tensorflow.keras.layers import (
10
+ Input, Embedding, Dense, Dropout, GlobalMaxPooling1D,
11
+ Conv1D, LSTM, GRU, Bidirectional, Attention, GlobalAveragePooling1D
12
+ )
13
+ TF_AVAILABLE = True
14
+ except ImportError:
15
+ TF_AVAILABLE = False
16
+
17
+ try:
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn.utils.rnn import pad_sequence
21
+ from transformers import (
22
+ AutoTokenizer, AutoModel, AutoConfig,
23
+ BertForSequenceClassification, RobertaForSequenceClassification,
24
+ DistilBertForSequenceClassification, Trainer, TrainingArguments
25
+ )
26
+ from transformers.tokenization_utils_base import BatchEncoding
27
+ TORCH_AVAILABLE = True
28
+ except ImportError:
29
+ TORCH_AVAILABLE = False
30
+
31
+
32
if TF_AVAILABLE:
    _KerasLayerBase = tf.keras.layers.Layer
else:
    # Allow this module to import without TensorFlow; the layer is
    # unusable in that case but nothing crashes at import time.
    _KerasLayerBase = object


class AttentionLayer(_KerasLayerBase):
    """Additive attention pooling: (batch, seq, dim) -> (batch, dim).

    Scores each timestep with tanh(h_t · W + b_t), softmaxes the scores
    over the sequence axis, and returns the attention-weighted sum of the
    inputs.

    Fix: the class previously subclassed tf.keras.layers.Layer
    unconditionally, so importing this module raised NameError whenever
    TensorFlow was absent — defeating the TF_AVAILABLE try/except guard
    at the top of the file. The base class is now chosen conditionally.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # W: (dim, 1) projection producing one raw score per timestep.
        self.W = self.add_weight(
            shape=(input_shape[-1], 1),
            initializer='random_normal',
            trainable=True,
            name='attention_weight'
        )
        # b: (seq, 1) per-position bias — ties the layer to a fixed
        # sequence length set at build time.
        self.b = self.add_weight(
            shape=(input_shape[1], 1),
            initializer='zeros',
            trainable=True,
            name='attention_bias'
        )
        super().build(input_shape)

    def call(self, inputs, **kwargs):
        e = tf.keras.activations.tanh(tf.matmul(inputs, self.W) + self.b)
        e = tf.squeeze(e, axis=-1)
        a = tf.nn.softmax(e, axis=1)  # normalise over the sequence axis
        a = tf.expand_dims(a, axis=-1)
        weighted_input = inputs * a
        return tf.reduce_sum(weighted_input, axis=1)  # weighted sum over time
58
+
59
+
60
def build_mlp(
    input_dim: int,
    num_classes: int,
    hidden_dims: tuple = (256, 128),
    dropout: float = 0.3,
    activation: str = 'relu'
) -> 'tf.keras.Model':
    """Dense feed-forward classifier over fixed-size feature vectors.

    One Dense+Dropout pair per entry in hidden_dims, then a softmax (or,
    for num_classes <= 2, sigmoid) output head of num_classes units.

    Fix: hidden_dims previously had a mutable list default ([256, 128]) —
    the classic shared-mutable-default hazard. A tuple default is
    immutable and accepts the same call patterns (any iterable of ints).
    """
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(input_dim,))
    x = inputs
    for dim in hidden_dims:
        x = Dense(dim, activation=activation)(x)
        x = Dropout(dropout)(x)
    # NOTE(review): for binary problems this yields Dense(num_classes,
    # sigmoid) — confirm callers pair it with the matching loss.
    outputs = Dense(num_classes, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
76
+
77
+
78
def build_kim_cnn(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    filter_sizes: tuple = (3, 4, 5),
    num_filters: int = 100,
    dropout: float = 0.5,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Kim (2014) multi-window CNN for sentence classification.

    Parallel Conv1D branches — one per window size — are max-pooled over
    time and concatenated before the dropout + softmax/sigmoid head. When
    pre_embed_matrix is given it is used as a frozen embedding table.

    Fix: filter_sizes previously had a mutable list default ([3, 4, 5]);
    a tuple default removes the shared-mutable-default hazard while
    accepting the same call patterns (any iterable of ints).
    """
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    inputs = Input(shape=(max_len,))
    if pre_embed_matrix is not None:
        embedding = Embedding(
            vocab_size, embed_dim,
            weights=[pre_embed_matrix],
            trainable=False  # frozen pretrained vectors
        )(inputs)
    else:
        embedding = Embedding(vocab_size, embed_dim)(inputs)

    pooled_outputs = []
    for fs in filter_sizes:
        x = Conv1D(num_filters, fs, activation='relu')(embedding)
        x = GlobalMaxPooling1D()(x)
        pooled_outputs.append(x)

    merged = tf.concat(pooled_outputs, axis=1)
    x = Dropout(dropout)(merged)
    outputs = Dense(num_classes, activation='softmax' if num_classes > 2 else 'sigmoid')(x)
    return models.Model(inputs, outputs)
110
+
111
+
112
def build_lstm(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    lstm_units: int = 128,
    dropout: float = 0.3,
    bidirectional: bool = False,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """(Bi)LSTM text classifier over integer token sequences.

    When pre_embed_matrix is given it is used as a frozen embedding table;
    `dropout` is applied to both the input and recurrent connections of
    the LSTM.
    """
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")

    inputs = Input(shape=(max_len,))
    embed_kwargs = {}
    if pre_embed_matrix is not None:
        embed_kwargs = {"weights": [pre_embed_matrix], "trainable": False}
    x = Embedding(vocab_size, embed_dim, **embed_kwargs)(inputs)

    recurrent = LSTM(lstm_units, dropout=dropout, recurrent_dropout=dropout)
    x = Bidirectional(recurrent)(x) if bidirectional else recurrent(x)

    head_activation = 'softmax' if num_classes > 2 else 'sigmoid'
    outputs = Dense(num_classes, activation=head_activation)(x)
    return models.Model(inputs, outputs)
138
+
139
+
140
def build_cnn_lstm(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    filter_size: int = 3,
    num_filters: int = 128,
    lstm_units: int = 64,
    dropout: float = 0.3,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Build a hybrid classifier: Conv1D feature extractor feeding an LSTM.

    The convolution uses 'same' padding so the LSTM still sees `max_len`
    timesteps. A frozen pretrained embedding is used when supplied.
    """
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    token_ids = Input(shape=(max_len,))
    if pre_embed_matrix is None:
        features = Embedding(vocab_size, embed_dim)(token_ids)
    else:
        features = Embedding(
            vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False
        )(token_ids)

    features = Conv1D(num_filters, filter_size, activation='relu', padding='same')(features)
    features = LSTM(lstm_units, dropout=dropout)(features)
    head_activation = 'softmax' if num_classes > 2 else 'sigmoid'
    logits = Dense(num_classes, activation=head_activation)(features)
    return models.Model(token_ids, logits)
163
+
164
+
165
def build_birnn_attention(
    max_len: int,
    vocab_size: int,
    embed_dim: int,
    num_classes: int,
    rnn_units: int = 64,
    dropout: float = 0.3,
    pre_embed_matrix: Optional[np.ndarray] = None
) -> 'tf.keras.Model':
    """Build a bidirectional LSTM classifier with an attention pooling layer.

    The BiLSTM returns the full sequence, which `AttentionLayer` collapses
    into a single document vector before the classification head.
    """
    if not TF_AVAILABLE:
        raise ImportError("TensorFlow not available")
    token_ids = Input(shape=(max_len,))
    if pre_embed_matrix is None:
        states = Embedding(vocab_size, embed_dim)(token_ids)
    else:
        states = Embedding(
            vocab_size, embed_dim, weights=[pre_embed_matrix], trainable=False
        )(token_ids)

    states = Bidirectional(LSTM(rnn_units, return_sequences=True, dropout=dropout))(states)
    pooled = AttentionLayer()(states)
    head_activation = 'softmax' if num_classes > 2 else 'sigmoid'
    logits = Dense(num_classes, activation=head_activation)(pooled)
    return models.Model(token_ids, logits)
186
+
187
+
188
# Short aliases -> Hugging Face Hub model ids for Russian-capable encoders
# accepted by get_transformer_classifier().
_RUSSIAN_TRANSFORMERS = {
    "rubert": "DeepPavlov/rubert-base-cased",
    "ruroberta": "sberbank-ai/ruRoberta-large",
    "distilbert-multilingual": "distilbert-base-multilingual-cased"
}
193
+
194
def get_transformer_classifier(
    model_name: str = "rubert",
    num_classes: int = 2,
    problem_type: Literal["single_label", "multi_label"] = "single_label"
) -> Tuple[Any, Any]:
    """Load a pretrained Russian transformer with a classification head.

    `model_name` is one of the aliases in `_RUSSIAN_TRANSFORMERS`. The
    architecture class is picked from the resolved model id, and the
    config's `problem_type` is set so the HF loss matches single- vs
    multi-label training.

    Returns:
        (model, tokenizer) pair.

    Raises:
        ImportError: if torch/transformers are missing.
        ValueError: on an unknown `model_name` alias.
    """
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch or transformers not available")

    if model_name not in _RUSSIAN_TRANSFORMERS:
        raise ValueError(f"Unknown model_name. Choose from: {list(_RUSSIAN_TRANSFORMERS.keys())}")

    model_id = _RUSSIAN_TRANSFORMERS[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Pick the architecture class from the model id. The "roberta" test must
    # come first: it is the more specific match among the supported ids.
    lowered_id = model_id.lower()
    if "roberta" in lowered_id:
        architecture = RobertaForSequenceClassification
    elif "distilbert" in lowered_id:
        architecture = DistilBertForSequenceClassification
    else:
        architecture = BertForSequenceClassification
    model = architecture.from_pretrained(model_id, num_labels=num_classes)

    model.config.problem_type = (
        "multi_label_classification"
        if problem_type == "multi_label"
        else "single_label_classification"
    )
    return model, tokenizer
228
+
229
+
230
def quantize_pytorch_model(model: 'torch.nn.Module', backend: str = "qnnpack") -> 'torch.nn.Module':
    """Apply eager-mode post-training static quantization to `model` in place.

    Args:
        model: The float module to quantize (mutated in place and returned).
        backend: Quantization backend config name; "qnnpack" targets
            ARM/mobile, "fbgemm" targets x86 servers.

    Returns:
        The same module object, converted to its quantized form.

    Raises:
        ImportError: if PyTorch is not installed.
    """
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch not available")
    model.eval()
    model.qconfig = torch.quantization.get_default_qconfig(backend)
    torch.quantization.prepare(model, inplace=True)
    # NOTE(review): no calibration pass runs between prepare() and convert(),
    # so the inserted observers never see data and the chosen scales/zero
    # points may be degenerate — confirm callers run representative batches
    # through the prepared model first, or consider dynamic quantization.
    torch.quantization.convert(model, inplace=True)
    return model
238
+
239
+
240
def prune_keras_model(model: 'tf.keras.Model', sparsity: float = 0.5) -> 'tf.keras.Model':
    """Wrap `model` for magnitude pruning up to the given final sparsity.

    Uses a polynomial sparsity schedule from 0.0 to `sparsity` over the
    first 1000 training steps. Requires the optional
    tensorflow-model-optimization package.
    """
    try:
        import tensorflow_model_optimization as tfmot
    except ImportError:
        raise ImportError("Install tensorflow-model-optimization for pruning")
    schedule = tfmot.sparsity.keras.PolynomialDecay(
        initial_sparsity=0.0,
        final_sparsity=sparsity,
        begin_step=0,
        end_step=1000,
    )
    return tfmot.sparsity.keras.prune_low_magnitude(model, pruning_schedule=schedule)
252
+
253
+
254
def prepare_keras_inputs(
    texts: list,
    tokenizer=None,
    max_len: int = 128,
    vocab: Optional[dict] = None
) -> np.ndarray:
    """Turn raw texts into a padded integer-id matrix for Keras models.

    With a HuggingFace-style `tokenizer`, returns its `input_ids` (numpy).
    Otherwise a Keras `Tokenizer` is used: `vocab` (a word->index mapping)
    is applied when given, else a fresh vocabulary is fitted on `texts`.
    """
    if tokenizer is None:
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        word_indexer = Tokenizer(oov_token="<OOV>")
        if vocab:
            word_indexer.word_index = vocab
        else:
            word_indexer.fit_on_texts(texts)
        return pad_sequences(word_indexer.texts_to_sequences(texts), maxlen=max_len)

    encoded = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="np")
    return encoded['input_ids']
273
+
274
+
275
def compile_keras_model(
    model: 'tf.keras.Model',
    learning_rate: float = 2e-5,
    num_classes: int = 2
):
    """Compile `model` with Adam and a loss matching the class count.

    More than two classes uses sparse categorical cross-entropy (integer
    labels); otherwise binary cross-entropy. Accuracy is tracked.
    """
    if num_classes > 2:
        objective = 'sparse_categorical_crossentropy'
    else:
        objective = 'binary_crossentropy'
    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss=objective,
        metrics=['accuracy'],
    )
    return model
src/text_preprocessing.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ from typing import List, Optional, Union, Dict, Any, Callable
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
8
+ from nltk.corpus import stopwords
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk import download as nltk_download
11
+ from nltk.stem import WordNetLemmatizer
12
+ import spacy
13
+ from gensim.models import KeyedVectors
14
+ from transformers import AutoTokenizer, AutoModel
15
+ import torch
16
+ import emoji
17
# (debug print removed: importing a library module should not write to stdout)
18
+
19
# Fetch the NLTK data this module depends on (punkt tokenizer models,
# stopword lists, WordNet). Best-effort: offline environments get a console
# warning instead of a hard failure at import time.
try:
    nltk_download('punkt', quiet=True)
    nltk_download('stopwords', quiet=True)
    nltk_download('wordnet', quiet=True)
except Exception as e:
    print(f"Warning: NLTK data download failed: {e}")

# Lazily-initialized module-level singletons, populated by the _load_*
# helpers below so heavy models are only loaded on first use.
_SPACY_MODEL = None        # cached spaCy pipeline (_load_spacy_model)
_NLTK_LEMMATIZER = None    # cached WordNetLemmatizer (_load_nltk_lemmatizer)
_BERT_TOKENIZER = None     # cached transformers tokenizer (_load_bert_model)
_BERT_MODEL = None         # cached transformers encoder (_load_bert_model)
30
+
31
+
32
def _load_spacy_model(lang: str = "en_core_web_sm"):
    """Load and cache a spaCy pipeline.

    Args:
        lang: spaCy model package name (e.g. "en_core_web_sm").

    Returns:
        The loaded spaCy Language pipeline.

    Raises:
        ValueError: if the requested model package is not installed.
    """
    global _SPACY_MODEL
    # Bug fix: the cache previously ignored `lang`, so once any model was
    # loaded, every later call got that first model regardless of the
    # language requested. Track which package is cached and reload on change.
    cached_lang = getattr(_load_spacy_model, "_cached_lang", None)
    if _SPACY_MODEL is None or cached_lang != lang:
        try:
            _SPACY_MODEL = spacy.load(lang)
        except OSError:
            raise ValueError(
                f"spaCy model '{lang}' not found. Please install it via: python -m spacy download {lang}"
            )
        _load_spacy_model._cached_lang = lang
    return _SPACY_MODEL
42
+
43
+
44
def _load_nltk_lemmatizer():
    """Return the shared WordNetLemmatizer, instantiating it lazily."""
    global _NLTK_LEMMATIZER
    lemmatizer = _NLTK_LEMMATIZER
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
        _NLTK_LEMMATIZER = lemmatizer
    return lemmatizer
49
+
50
+
51
def _load_bert_model(model_name: str = "bert-base-uncased"):
    """Load and cache a transformers tokenizer/encoder pair.

    Args:
        model_name: Hugging Face model id.

    Returns:
        (tokenizer, model) tuple.
    """
    global _BERT_TOKENIZER, _BERT_MODEL
    # Bug fix: the cache previously ignored `model_name`, so asking for a
    # different model after the first call silently returned the old one.
    cached_name = getattr(_load_bert_model, "_cached_name", None)
    if _BERT_TOKENIZER is None or _BERT_MODEL is None or cached_name != model_name:
        _BERT_TOKENIZER = AutoTokenizer.from_pretrained(model_name)
        _BERT_MODEL = AutoModel.from_pretrained(model_name)
        _load_bert_model._cached_name = model_name
    return _BERT_TOKENIZER, _BERT_MODEL
57
+
58
+
59
def clean_text(text: str) -> str:
    """Strip markup, URLs and non-printable characters; normalize whitespace.

    Removes HTML/XML tags and http(s)/www URLs, drops any character outside
    `string.printable` (i.e. non-ASCII), then collapses whitespace runs to
    single spaces and trims the ends.
    """
    no_markup = re.sub(r"<[^>]+>", "", text)
    no_urls = re.sub(r"https?://\S+|www\.\S+", "", no_markup)
    ascii_only = "".join(filter(string.printable.__contains__, no_urls))
    return re.sub(r"\s+", " ", ascii_only).strip()
65
+
66
+
67
def replace_emojis(text: str) -> str:
    """Replace each emoji with its textual name, padded with spaces.

    Using " " as both delimiters (instead of the default ":" pair) keeps the
    demojized name from fusing with adjacent words during tokenization.
    """
    return emoji.demojize(text, delimiters=(" ", " "))
69
+
70
+
71
def preprocess_text(
    text: str,
    lang: str = "en",
    remove_stopwords: bool = True,
    use_spacy: bool = True,
    lemmatize: bool = True,
    emoji_to_text: bool = True,
    lowercase: bool = True,
    spacy_model: Optional[str] = None,
    replace_entities: bool = False  # off by default: numbers/URLs kept verbatim
) -> List[str]:
    """Normalize one document and return its token list.

    Pipeline: optional emoji-to-text, HTML tag stripping, optional
    URL/email/number placeholder substitution, punctuation removal,
    whitespace collapsing, optional lowercasing, then tokenization (spaCy or
    NLTK) with optional lemmatization and stopword removal.

    Args:
        text: Raw input document.
        lang: ISO language code ("en", "ru", ...) used to pick the spaCy
            model and the NLTK stopword list.
        remove_stopwords: Drop stopwords after tokenization.
        use_spacy: Tokenize/lemmatize with spaCy; otherwise NLTK.
        lemmatize: Emit lemmas instead of surface forms.
        emoji_to_text: Convert emoji to their textual names first.
        lowercase: Lowercase before tokenization (placeholders too).
        spacy_model: Explicit spaCy package name, overriding `lang`.
        replace_entities: Substitute URL/EMAIL/NUM placeholder tokens.

    Returns:
        List of non-empty, non-punctuation tokens.
    """
    if emoji_to_text:
        text = replace_emojis(text)

    # Strip HTML/XML tags.
    text = re.sub(r"<[^>]+>", "", text)

    # Bug fix: entity replacement must run BEFORE punctuation stripping.
    # Previously it ran after, so the URL/email patterns could never match
    # (':' '/' '.' '@' were already gone) and the "<NUM>" placeholder's
    # angle brackets were re-introduced after being banned. Placeholders are
    # plain word tokens so they survive the punctuation pass below.
    if replace_entities:
        text = re.sub(r"https?://\S+|www\.\S+", " URL ", text)
        text = re.sub(r"\S+@\S+", " EMAIL ", text)
        text = re.sub(r"\b\d+\b", " NUM ", text)

    # Replace every non-word, non-space character with a space, then
    # collapse whitespace runs.
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    if lowercase:
        text = text.lower()

    if use_spacy:
        spacy_lang = spacy_model or ("en_core_web_sm" if lang == "en" else f"{lang}_core_news_sm")
        nlp = _load_spacy_model(spacy_lang)
        doc = nlp(text)
        kept = [tok for tok in doc if not tok.is_space and not tok.is_punct]
        tokens = [tok.lemma_ for tok in kept] if lemmatize else [tok.text for tok in kept]
        if remove_stopwords:
            tokens = [t for t in tokens if not nlp.vocab[t].is_stop]
    else:
        tokens = word_tokenize(text)
        if lemmatize:
            lemmatizer = _load_nltk_lemmatizer()
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
        if remove_stopwords:
            # Bug fix: NLTK names its stopword lists by full language name
            # ("english"), not ISO code ("en"), so the old
            # `lang in stopwords.fileids()` check silently skipped stopword
            # removal for ISO codes. Map common codes; unknown codes still
            # fall back to an empty set rather than raising.
            iso_to_nltk = {
                "en": "english", "ru": "russian", "de": "german",
                "fr": "french", "es": "spanish", "it": "italian",
                "pt": "portuguese", "nl": "dutch",
            }
            stop_lang = iso_to_nltk.get(lang, lang)
            stop_words = set(stopwords.words(stop_lang)) if stop_lang in stopwords.fileids() else set()
            tokens = [t for t in tokens if t not in stop_words]

    return [t for t in tokens if t not in string.punctuation and len(t) > 0]
126
+
127
+
128
class TextVectorizer:
    """Fit-and-transform wrappers around sklearn count/tf-idf vectorizers.

    Each method fits a fresh vectorizer on the given corpus and keeps it on
    the instance so the fitted vocabulary can be inspected or reused.
    """

    def __init__(self):
        # Set by bow() / tfidf() (ngrams delegates to tfidf()).
        self.bow_vectorizer = None
        self.tfidf_vectorizer = None

    def bow(self, texts: List[str], **kwargs) -> np.ndarray:
        """Dense bag-of-words counts; extra kwargs go to CountVectorizer."""
        vectorizer = CountVectorizer(**kwargs)
        self.bow_vectorizer = vectorizer
        return vectorizer.fit_transform(texts).toarray()

    def tfidf(self, texts: List[str], max_features: int = 5000, **kwargs) -> np.ndarray:
        """Dense TF-IDF matrix; input is assumed to be pre-lowercased."""
        kwargs['max_features'] = max_features
        vectorizer = TfidfVectorizer(lowercase=False, **kwargs)
        self.tfidf_vectorizer = vectorizer
        return vectorizer.fit_transform(texts).toarray()

    def ngrams(self, texts: List[str], ngram_range: tuple = (1, 2), **kwargs) -> np.ndarray:
        """TF-IDF over word n-grams (defaults to unigrams + bigrams)."""
        kwargs.setdefault("ngram_range", ngram_range)
        return self.tfidf(texts, **kwargs)
145
+
146
+
147
class EmbeddingVectorizer:
    """Document embeddings from static word vectors (word2vec/fastText/GloVe).

    Load one or more pretrained vector sets, then call get_embeddings() to
    turn tokenized documents into fixed-size vectors by aggregating their
    per-token vectors (mean or max). Out-of-vocabulary tokens are skipped.
    """

    def __init__(self):
        self.word2vec_model = None   # gensim KeyedVectors (binary word2vec format)
        self.fasttext_model = None   # gensim KeyedVectors (native gensim save)
        self.glove_vectors = None    # dict[str, np.ndarray] parsed from text GloVe

    def load_word2vec(self, path: str):
        """Load binary-format word2vec vectors via gensim."""
        self.word2vec_model = KeyedVectors.load_word2vec_format(path, binary=True)

    def load_fasttext(self, path: str):
        """Load gensim-native fastText KeyedVectors."""
        self.fasttext_model = KeyedVectors.load(path)

    def load_glove(self, glove_file: str, vocab_size: int = 400000, dim: int = 300):
        """Parse a text-format GloVe file into an in-memory dict.

        Only the first `vocab_size` lines are read. `dim` is informational;
        the actual vector length comes from the file itself.
        """
        self.glove_vectors = {}
        with open(glove_file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= vocab_size:
                    break
                values = line.split()
                word = values[0]
                self.glove_vectors[word] = np.array(values[1:], dtype="float32")

    def _get_word_vector(self, word: str, method: str = "word2vec") -> Optional[np.ndarray]:
        """Look `word` up in the selected vector set; None if missing or unloaded."""
        if method == "word2vec" and self.word2vec_model and word in self.word2vec_model:
            return self.word2vec_model[word]
        elif method == "fasttext" and self.fasttext_model and word in self.fasttext_model:
            return self.fasttext_model[word]
        elif method == "glove" and self.glove_vectors and word in self.glove_vectors:
            return self.glove_vectors[word]
        return None

    def _aggregate_vectors(
        self, vectors: List[np.ndarray], strategy: str = "mean", dim: Optional[int] = None
    ) -> np.ndarray:
        """Collapse token vectors into a single document vector.

        `dim` sets the size of the zero vector returned for empty documents;
        it falls back to 300 (the old hard-coded value) when unknown.

        Raises:
            ValueError: if `strategy` is neither 'mean' nor 'max'.
        """
        if not vectors:
            return np.zeros(dim if dim is not None else 300)
        if strategy == "mean":
            return np.mean(vectors, axis=0)
        elif strategy == "max":
            return np.max(vectors, axis=0)
        else:
            raise ValueError("Strategy must be 'mean' or 'max'")

    def get_embeddings(
        self,
        tokenized_texts: List[List[str]],
        method: str = "word2vec",
        aggregation: str = "mean",
    ) -> np.ndarray:
        """Embed each tokenized document into one vector per document.

        Bug fix: empty/all-OOV documents previously always got a 300-dim
        zero vector, producing a ragged object array (or a crash on recent
        numpy) whenever the loaded vectors had a different dimensionality.
        The fallback now matches the dimensionality actually observed in the
        corpus (resolved in a first pass, before any aggregation).
        """
        per_doc = []
        observed_dim = None
        for tokens in tokenized_texts:
            found = [
                v for v in (self._get_word_vector(t, method=method) for t in tokens)
                if v is not None
            ]
            per_doc.append(found)
            if found and observed_dim is None:
                observed_dim = len(found[0])
        return np.array([
            self._aggregate_vectors(vectors, strategy=aggregation, dim=observed_dim)
            for vectors in per_doc
        ])
206
+
207
+
208
def get_contextual_embeddings(
    texts: List[str],
    model_name: str = "bert-base-uncased",
    aggregation: str = "mean",
    device: str = "cpu",
) -> np.ndarray:
    """Encode texts with a (cached) BERT-style encoder, pooled per document.

    Each text is tokenized (truncated to 512 wordpieces), passed through the
    encoder without gradients, and its last hidden states are pooled with
    mean or max after dropping the first and last positions — a simple
    [CLS]/[SEP] heuristic.

    Raises:
        ValueError: if `aggregation` is neither 'mean' nor 'max'.
    """
    tokenizer, model = _load_bert_model(model_name)
    model.to(device)
    model.eval()

    pooled = []
    reducers = {"mean": np.mean, "max": np.max}
    with torch.no_grad():
        for text in texts:
            batch = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            )
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            hidden = model(**batch).last_hidden_state[0].cpu().numpy()

            # Drop first/last positions as a cheap [CLS]/[SEP] exclusion.
            if len(hidden) > 2:
                hidden = hidden[1:-1]

            reduce_fn = reducers.get(aggregation)
            if reduce_fn is None:
                raise ValueError("aggregation must be 'mean' or 'max'")
            pooled.append(reduce_fn(hidden, axis=0))

    return np.array(pooled)
245
+
246
+
247
def extract_meta_features(texts: Union[List[str], pd.Series]) -> pd.DataFrame:
    """Compute shallow per-document statistics as a DataFrame.

    Columns: text_length, avg_word_length, num_unique_words,
    punctuation_ratio, uppercase_ratio, digit_ratio, flesch_reading_ease.
    The readability column is always NaN — no readability library is wired
    in; it is kept only for schema compatibility with downstream consumers.

    Args:
        texts: Raw documents as a list or pandas Series.

    Returns:
        One row per document, in input order.
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()

    rows = []
    for text in texts:
        n_chars = len(text)
        words = text.split()
        # The [0] fallback keeps np.mean defined (0.0) for empty documents.
        word_lengths = [len(w) for w in words] if words else [0]

        num_punct = sum(1 for c in text if c in string.punctuation)
        num_upper = sum(1 for c in text if c.isupper())
        num_digits = sum(1 for c in text if c.isdigit())

        rows.append({
            "text_length": n_chars,
            "avg_word_length": np.mean(word_lengths),
            "num_unique_words": len(set(words)) if words else 0,
            "punctuation_ratio": num_punct / n_chars if n_chars > 0 else 0,
            "uppercase_ratio": num_upper / n_chars if n_chars > 0 else 0,
            "digit_ratio": num_digits / n_chars if n_chars > 0 else 0,
            # Fixed: removed a dead try/except that unconditionally set NaN.
            "flesch_reading_ease": np.nan,
        })

    return pd.DataFrame(rows)