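# Spaces: Sleeping
# (The line above appears to be a page-status banner carried over from the web
# view this file was copied from; it is not part of the program.)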
import json
import logging
import os
import sys

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
# Project root = three directory levels above this file. It is put at the front
# of sys.path so absolute `src.*` imports resolve when this file is run directly.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

# Root logging configuration plus the logger used throughout this module.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("distilbert_model")
def _positive_class_probas(logits):
    """Return the softmax probability of class index 1 for every row of *logits*."""
    return torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()


def train_distilbert(cfg, splits_dir, save_dir):
    """Fine-tune ``distilbert-base-uncased`` as a binary classifier and save its outputs.

    A single "proxy" model is trained on a stratified 80 % subset of the
    training split, evaluated each epoch on the validation split, and then used
    to score the *entire* training split.

    NOTE(review): the array written to ``distilbert_oof.npy`` is therefore not
    truly out-of-fold. The 80 % of rows the model was trained on are scored
    in-sample; only the held-out 20 % are unseen. Downstream stacking code
    should treat these values accordingly.

    Args:
        cfg: Mapping with optional ``"preprocessing"`` and ``"training"``
            sections. Keys read: ``preprocessing.bert_max_len`` (default 512),
            ``training.bert_batch_size`` (16), ``training.bert_epochs`` (3) and
            ``training.lr_learning_rate`` (2e-5). A missing or empty section
            falls back to the defaults.
        splits_dir: Directory containing ``df_train.csv`` and ``df_val.csv``;
            both must provide ``clean_text`` and ``binary_label`` columns.
        save_dir: Output directory (created if needed). Receives a
            ``checkpoints/`` sub-directory, the final model and tokenizer,
            ``distilbert_oof.npy`` and ``cm.png``.

    Returns:
        None. All results are written to ``save_dir``.
    """
    # Import first so a broken or missing helper fails before the expensive
    # fine-tuning run rather than after it.
    from src.models.logistic_model import plot_and_save_cm

    os.makedirs(save_dir, exist_ok=True)

    # 1. Load data; missing text becomes an empty string so tokenization never sees NaN.
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))
    train_df["clean_text"] = train_df["clean_text"].fillna("")
    val_df["clean_text"] = val_df["clean_text"].fillna("")

    # `or {}` also covers a section that exists in the YAML but is empty (None).
    prep_cfg = cfg.get("preprocessing") or {}
    train_cfg = cfg.get("training") or {}
    maxlen = prep_cfg.get("bert_max_len", 512)
    batch_size = train_cfg.get("bert_batch_size", 16)
    epochs = train_cfg.get("bert_epochs", 3)
    lr = float(train_cfg.get("lr_learning_rate", 2e-5))

    logger.info("Loading DistilBERT tokenizer...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 2. Tokenization helper; padding is deferred to the data collator (per batch).
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=False, truncation=True, max_length=maxlen)

    def to_hf_dataset(df):
        """Build a two-column (text, labels) HuggingFace Dataset from *df*."""
        frame = pd.DataFrame({"text": df["clean_text"], "labels": df["binary_label"]})
        return Dataset.from_pandas(frame, preserve_index=False)

    # 3. Stratified 80/20 proxy split: one model instead of a 5-fold run.
    #    Only the 80 % side is used; the 20 % indices are not needed here.
    idx_train, _ = train_test_split(
        range(len(train_df)),
        test_size=0.20,
        stratify=train_df["binary_label"],
        random_state=42,
    )

    # 4. Build and tokenize the datasets. The training subset is selected from
    #    the already-tokenized full set, so those rows are not tokenized twice.
    logger.info("Tokenizing datasets...")
    hf_full_train = to_hf_dataset(train_df).map(tokenize_function, batched=True)
    hf_val = to_hf_dataset(val_df).map(tokenize_function, batched=True)
    hf_sub_train = hf_full_train.select(idx_train)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # 5. Initialize model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # 6. Trainer setup: evaluate and checkpoint every epoch, then restore the
    #    checkpoint with the lowest validation loss.
    training_args = TrainingArguments(
        output_dir=os.path.join(save_dir, "checkpoints"),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=2,
        dataloader_num_workers=2,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),
        disable_tqdm=False,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_sub_train,
        eval_dataset=hf_val,
        processing_class=tokenizer,
        data_collator=data_collator,
    )

    # 7. Train
    logger.info("Starting DistilBERT internal proxy training...")
    trainer.train()

    # 8. Save model and tokenizer side by side so the directory is self-contained.
    logger.info("Saving final fine-tuned model...")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)

    # 9. Score the entire training set with the proxy model (see docstring NOTE).
    logger.info("Generating OOF predictions on full train set proxy wrapper...")
    oof_probas = _positive_class_probas(trainer.predict(hf_full_train).predictions)
    np.save(os.path.join(save_dir, "distilbert_oof.npy"), oof_probas)
    logger.info("Saved distilbert_oof.npy")

    # 10. Validation predictions, thresholded at 0.5, are handed to the project's
    #     confusion-matrix helper (presumably it renders the plot to cm.png).
    val_probas = _positive_class_probas(trainer.predict(hf_val).predictions)
    plot_and_save_cm(
        val_df["binary_label"],
        (val_probas > 0.5).astype(int),
        os.path.join(save_dir, "cm.png"),
        title="DistilBERT Confusion Matrix",
    )
    logger.info("DistilBERT Training completed!")
# ====================================================================
# OPTIONAL: Full K-Fold OOF (GPU-intensive)
# --------------------------------------------------------------------
# The strategy above saves enormous compute by training a single
# proxy model and using it to predict the full training pool. A strict
# K-Fold scheme requires training 5 entirely separate DistilBERT
# instances, i.e. roughly 15+ epochs of training in total. Use the
# code below if massive parallel A100s are available.
#
# The reference implementation is kept inside a string literal, so it
# is never executed as part of this module.
#
"""
from sklearn.model_selection import StratifiedKFold

def strict_kfold_distilbert(train_df, tokenize_function, data_collator, lr, batch_size, epochs, save_dir):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_probas = np.zeros(len(train_df), dtype=np.float32)

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df["binary_label"])):
        logger.info(f"Training Fold {fold+1}/5")
        df_train = train_df.iloc[train_idx].copy()
        df_val = train_df.iloc[val_idx].copy()

        ds_train = Dataset.from_pandas(
            pd.DataFrame({"text": df_train["clean_text"], "labels": df_train["binary_label"]}),
            preserve_index=False,
        ).map(tokenize_function, batched=True)
        ds_val = Dataset.from_pandas(
            pd.DataFrame({"text": df_val["clean_text"], "labels": df_val["binary_label"]}),
            preserve_index=False,
        ).map(tokenize_function, batched=True)

        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
        training_args = TrainingArguments(
            output_dir=os.path.join(save_dir, f"fold_{fold}"),
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            num_train_epochs=epochs,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=ds_train,
            eval_dataset=ds_val,
            data_collator=data_collator,
        )
        trainer.train()

        # Each row is scored only by the fold model that did not train on it.
        fold_preds = trainer.predict(ds_val)
        oof_probas[val_idx] = torch.softmax(torch.tensor(fold_preds.predictions), dim=-1)[:, 1].numpy()

    np.save(os.path.join(save_dir, "distilbert_oof.npy"), oof_probas)
"""
# ====================================================================
if __name__ == "__main__":
    # Script entry point: read <project root>/config/config.yaml, resolve the
    # splits and model directories relative to the project root, then train.
    import yaml

    cfg_path = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(cfg_path, "r", encoding="utf-8") as cfg_file:
        # safe_load: never constructs arbitrary Python objects from the YAML.
        config = yaml.safe_load(cfg_file)

    s_dir = os.path.join(_PROJECT_ROOT, config["paths"]["splits_dir"])
    m_dir = os.path.join(_PROJECT_ROOT, config["paths"]["models_dir"], "distilbert_model")
    train_distilbert(config, s_dir, m_dir)