# File size: 11,277 Bytes
#
# 47c017c

"""
Social Media Sentiment Analysis - End-to-End Training Pipeline
Fine-tunes DeBERTa-v3-base on SST-2 + Tweet Sentiment for 95%+ accuracy
Based on: DeBERTaV3 paper (arxiv:2111.09543) training recipe
Optimized for CPU training with gradient accumulation
"""

import os
import json
import numpy as np
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
import evaluate
import time

# ── Configuration ──────────────────────────────────────────────────
# Base checkpoint to fine-tune and the Hub repository the result is pushed to.
MODEL_NAME = "microsoft/deberta-v3-base"
HUB_MODEL_ID = "rajvivan/deberta-v3-sentiment-analysis"

# Sequence length and batch geometry.
MAX_LENGTH = 128  # Shorter for CPU efficiency
TRAIN_BATCH_SIZE = 8  # Small for CPU RAM
EVAL_BATCH_SIZE = 16
GRADIENT_ACCUM = 4  # Effective batch = 32

# Optimisation schedule.
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
WARMUP_STEPS = 300
WEIGHT_DECAY = 0.01
SEED = 42

# Probe CUDA once: fp16 is enabled exactly when a CUDA device is available,
# and the reported device string follows the same answer.
use_fp16 = torch.cuda.is_available()
device = 'cuda' if use_fp16 else 'cpu'

# Start-up banner, one entry per output line.
print(
    "🚀 Starting Sentiment Analysis Training",
    f"   Model: {MODEL_NAME}",
    f"   Hub ID: {HUB_MODEL_ID}",
    f"   Device: {device}",
    f"   Effective batch size: {TRAIN_BATCH_SIZE * GRADIENT_ACCUM}",
    sep="\n",
)

# Wall-clock reference taken before any data or model loading happens.
start_time = time.time()

# ── 1. Load and Prepare Datasets ──────────────────────────────────
print("\n📦 Loading datasets...")

# Both corpora are fetched by their Hub dataset ids.
sst2 = load_dataset("stanfordnlp/sst2")
tweets = load_dataset("mteb/tweet_sentiment_extraction")

# Report the size of every split this script uses later on.
print("   SST-2 train: {} | val: {}".format(len(sst2['train']), len(sst2['validation'])))
print("   Tweet train: {} | test: {}".format(len(tweets['train']), len(tweets['test'])))

# ── 2. Tokenizer ──────────────────────────────────────────────────
print("\n🔤 Loading tokenizer...")
# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_sst2(examples):
    """Tokenize the ``sentence`` field of an SST-2 batch.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the texts to
            encode (this function is used with ``map(batched=True)``).

    Returns:
        The tokenizer output for those sentences, truncated to
        ``MAX_LENGTH`` tokens and left unpadded (``padding=False``).
    """
    encode_options = {
        "truncation": True,
        "max_length": MAX_LENGTH,
        "padding": False,
    }
    sentences = examples["sentence"]
    return tokenizer(sentences, **encode_options)

def preprocess_tweet_text(text):
    """Mask user mentions and links in a tweet (per the original note: TimeLM paper).

    The text is split on single spaces only. A token that starts with ``@``
    and has at least one more character becomes ``@user``; otherwise a token
    that starts with ``http`` becomes ``http``. Every other token, including
    a lone ``@``, is kept unchanged, and the tokens are re-joined with single
    spaces so the original spacing is preserved.

    Args:
        text: Raw tweet text; may be empty or ``None``.

    Returns:
        The normalized text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    normalized = []
    for token in text.split(" "):
        if len(token) > 1 and token.startswith('@'):
            normalized.append('@user')
        elif token.startswith('http'):
            normalized.append('http')
        else:
            normalized.append(token)
    return " ".join(normalized)

def preprocess_tweets(examples):
    """Normalize and tokenize the ``text`` field of a tweet batch.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw tweets
            (this function is used with ``map(batched=True)``).

    Returns:
        The tokenizer output for the tweets after each has been passed
        through ``preprocess_tweet_text``; truncated to ``MAX_LENGTH``
        tokens and left unpadded (``padding=False``).
    """
    cleaned = list(map(preprocess_tweet_text, examples["text"]))
    return tokenizer(
        cleaned,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
    )

# Tokenize SST-2, then drop the raw columns that are no longer needed.
print("   Tokenizing SST-2...")
sst2_tok = sst2.map(preprocess_sst2, batched=True, num_proc=2).remove_columns(
    ["idx", "sentence"]
)

# Prepare tweets: binary (remove neutral, remap labels).
# Rows whose label is 1 are dropped from both splits here; the remaining
# labels are remapped in a later step.
print("   Preparing tweet binary data...")
tweets_train_bin, tweets_test_bin = (
    tweets[split].filter(lambda row: row["label"] != 1)
    for split in ("train", "test")
)

def remap_labels(example):
    """Collapse a tweet label to binary, in place.

    A label of ``2`` becomes ``1``; any other label becomes ``0``.

    Args:
        example: Mutable mapping with a ``"label"`` entry.

    Returns:
        The same ``example`` object, with ``"label"`` overwritten.
    """
    if example["label"] == 2:
        example["label"] = 1
    else:
        example["label"] = 0
    return example

# Rewrite the remaining tweet labels to the binary scheme (see remap_labels).
tweets_train_bin = tweets_train_bin.map(remap_labels)
tweets_test_bin = tweets_test_bin.map(remap_labels)

# Tokenize each split exactly once, then drop the raw columns. (The test split
# used to be tokenized twice with identical arguments; the first result was
# simply overwritten.)
tweets_train_tok = tweets_train_bin.map(preprocess_tweets, batched=True, num_proc=2)
tweets_train_tok = tweets_train_tok.remove_columns(["id", "text", "label_text"])
tweets_test_tok = tweets_test_bin.map(preprocess_tweets, batched=True, num_proc=2)
tweets_test_tok = tweets_test_tok.remove_columns(["id", "text", "label_text"])

# Cast label types to match SST-2 ClassLabel: the tweet splits are re-typed
# with the feature schema of the tokenized SST-2 train split so both sources
# can be concatenated below.
from datasets import ClassLabel, Features, Value
target_features = sst2_tok["train"].features
tweets_train_tok = tweets_train_tok.cast(target_features)
tweets_test_tok = tweets_test_tok.cast(target_features)

# Combine for training: SST-2 train followed by the binary tweet train split.
combined_train = concatenate_datasets([sst2_tok["train"], tweets_train_tok])
print(f"   Combined train: {len(combined_train)} samples")
print(f"   SST-2 val: {len(sst2_tok['validation'])} samples")
print(f"   Tweet test: {len(tweets_test_tok)} samples")

# ── 3. Model ──────────────────────────────────────────────────────
print("\n🧠 Loading model...")
# Name <-> index maps for the two classes; id2label is derived as the
# inverse of label2id.
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
id2label = {index: name for name, index in label2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Parameter counts: everything the model reports, and the subset that
# currently requires gradients.
total_params = model.num_parameters()
trainable = sum(
    weight.numel()
    for weight in filter(lambda w: w.requires_grad, model.parameters())
)
print(f"   Total params: {total_params:,}")
print(f"   Trainable: {trainable:,}")

# ── 4. Metrics ────────────────────────────────────────────────────
# Load the four `evaluate` metrics used by compute_metrics, in the same
# order as before, and bind each to its module-level name.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation.

    The Trainer in this script is built with ``preprocess_logits_for_metrics``,
    which already reduces the logits to class ids before they reach this
    function. Taking ``argmax`` over that 1-D array again would collapse all
    predictions into a single index, so the reduction is applied only when
    the input still carries a class axis.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be raw
            logits (two or more dimensions, classes on the last axis), a
            tuple whose first element is such an array, or class ids that
            were already reduced (one dimension).

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; the last three use ``average="weighted"``.
    """
    predictions, labels = eval_pred
    # Same tuple convention as preprocess_logits_for_metrics: first item wins.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    preds = np.asarray(predictions)
    if preds.ndim > 1:
        # Raw logits: pick the highest-scoring class per example.
        preds = np.argmax(preds, axis=-1)
    # Otherwise the values are already class ids and are used as they are.

    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"],
        "precision": precision_metric.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall": recall_metric.compute(predictions=preds, references=labels, average="weighted")["recall"],
    }

def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to predicted class ids.

    Args:
        logits: A tensor of scores with classes on the last dimension, or a
            tuple whose first element is that tensor.
        labels: Unused; accepted because the function is called with two
            arguments.

    Returns:
        The ``argmax`` over the last dimension of the score tensor.
    """
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)

# ── 5. Training ───────────────────────────────────────────────────
print("\n⚙️ Configuring training...")
# Padding happens at collation time; the tokenization functions in this
# file all pass padding=False.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Checkpoint location, checkpoint retention and Hub publishing.
output_options = dict(
    output_dir="./deberta-sentiment-output",
    hub_model_id=HUB_MODEL_ID,
    push_to_hub=True,
    save_strategy="epoch",
    save_total_limit=2,
)

# Optimiser, learning-rate schedule and batch geometry.
optimisation_options = dict(
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUM,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    lr_scheduler_type="linear",
    max_grad_norm=1.0,
    seed=SEED,
)

# Evaluate once per epoch and keep the checkpoint with the best accuracy.
evaluation_options = dict(
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Plain step-based logging, no progress bars, no external reporters.
logging_options = dict(
    logging_strategy="steps",
    logging_steps=100,
    logging_first_step=True,
    disable_tqdm=True,
    report_to="none",
)

# Precision and memory settings; fp16 follows CUDA availability (use_fp16).
runtime_options = dict(
    bf16=False,
    fp16=use_fp16,
    dataloader_num_workers=2,
    gradient_checkpointing=True,  # Save memory
)

training_args = TrainingArguments(
    **output_options,
    **optimisation_options,
    **evaluation_options,
    **logging_options,
    **runtime_options,
)

# Stop early once the tracked metric has failed to improve for two evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Training uses the combined SST-2 + tweet set; in-training evaluation uses
# the SST-2 validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_train,
    eval_dataset=sst2_tok["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    callbacks=[early_stopping],
)

# ── 6. Train ──────────────────────────────────────────────────────
print("\n🏋️ Starting training...")
# Rough estimate only: examples divided by the effective batch size.
effective_batch = TRAIN_BATCH_SIZE * GRADIENT_ACCUM
approx_steps = len(combined_train) // effective_batch
print(f"   Steps per epoch: ~{approx_steps}")

train_result = trainer.train()
# start_time was recorded at script start, so this elapsed time also covers
# dataset loading, tokenization and model setup.
train_time = time.time() - start_time

# ── 7. Evaluate ───────────────────────────────────────────────────
# (printed label, key in the dict returned by trainer.evaluate) for each
# reported metric; the labels carry their own column padding.
metric_rows = (
    ("Accuracy:  ", "eval_accuracy"),
    ("F1:        ", "eval_f1"),
    ("Precision: ", "eval_precision"),
    ("Recall:    ", "eval_recall"),
)

print("\n📊 Evaluating on SST-2 validation...")
sst2_results = trainer.evaluate(sst2_tok["validation"])
for heading, key in metric_rows:
    print(f"   {heading}{sst2_results[key]:.4f}")

print("\n📊 Evaluating on Tweet Sentiment test set...")
tweet_results = trainer.evaluate(tweets_test_tok)
for heading, key in metric_rows:
    print(f"   {heading}{tweet_results[key]:.4f}")

# ── 8. Save results ──────────────────────────────────────────────
os.makedirs("./deberta-sentiment-output", exist_ok=True)

# Hyper-parameters of this run, copied from the module-level configuration.
training_config = {
    "learning_rate": LEARNING_RATE,
    "effective_batch_size": TRAIN_BATCH_SIZE * GRADIENT_ACCUM,
    "per_device_batch_size": TRAIN_BATCH_SIZE,
    "gradient_accumulation_steps": GRADIENT_ACCUM,
    "epochs": NUM_EPOCHS,
    "warmup_steps": WARMUP_STEPS,
    "weight_decay": WEIGHT_DECAY,
    "max_length": MAX_LENGTH,
    "seed": SEED,
    "gradient_checkpointing": True,
}

# Number of examples in every split that was trained or evaluated on.
dataset_sizes = {
    "sst2_train": len(sst2_tok["train"]),
    "tweet_train_binary": len(tweets_train_tok),
    "combined_train": len(combined_train),
    "sst2_val": len(sst2_tok["validation"]),
    "tweet_test_binary": len(tweets_test_tok),
}

# Evaluation dicts are stored with every "eval_" occurrence removed from
# their keys.
sst2_eval = {name.replace("eval_", ""): value for name, value in sst2_results.items()}
tweet_eval = {name.replace("eval_", ""): value for name, value in tweet_results.items()}

results = {
    "model": MODEL_NAME,
    "hub_model_id": HUB_MODEL_ID,
    "total_parameters": total_params,
    "training_config": training_config,
    "datasets": dataset_sizes,
    "sst2_eval_results": sst2_eval,
    "tweet_eval_results": tweet_eval,
    "training_loss": train_result.training_loss,
    "training_time_seconds": train_time,
}

results_path = "./deberta-sentiment-output/results.json"
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)
print("\n💾 Results saved")

# ── 9. Push to Hub ────────────────────────────────────────────────
print("\n🚀 Pushing model to Hub...")
# Commit message and tags passed along with the final upload.
hub_commit_message = "DeBERTa-v3-base sentiment analysis - SST-2 + Tweet binary"
hub_tags = [
    "sentiment-analysis",
    "deberta-v3",
    "social-media",
    "text-classification",
]
trainer.push_to_hub(commit_message=hub_commit_message, tags=hub_tags)
print("✅ Model published: https://huggingface.co/" + HUB_MODEL_ID)

# ── Summary ───────────────────────────────────────────────────────
# Final recap: elapsed time in minutes, training loss, then accuracy and F1
# for each evaluation set, framed by two 60-character rules.
rule = "=" * 60
summary_lines = [
    "\n" + rule,
    "TRAINING COMPLETE - FINAL RESULTS",
    rule,
    f"Training time:   {train_time/60:.1f} minutes",
    f"Training loss:   {train_result.training_loss:.4f}",
    f"SST-2 Accuracy:  {sst2_results['eval_accuracy']:.4f}",
    f"SST-2 F1:        {sst2_results['eval_f1']:.4f}",
    f"Tweet Accuracy:  {tweet_results['eval_accuracy']:.4f}",
    f"Tweet F1:        {tweet_results['eval_f1']:.4f}",
    rule,
]
print(*summary_lines, sep="\n")