Add training script

47c017c verified 11 days ago

11.3 kB

	"""
	Social Media Sentiment Analysis - End-to-End Training Pipeline
	Fine-tunes DeBERTa-v3-base on SST-2 + Tweet Sentiment for 95%+ accuracy
	Based on: DeBERTaV3 paper (arxiv:2111.09543) training recipe
	Optimized for CPU training with gradient accumulation
	"""

	import os
	import json
	import numpy as np
	import torch
	from datasets import load_dataset, concatenate_datasets
	from transformers import (
	AutoTokenizer,
	AutoModelForSequenceClassification,
	TrainingArguments,
	Trainer,
	DataCollatorWithPadding,
	EarlyStoppingCallback,
	)
	import evaluate
	import time

	# ── Configuration ──────────────────────────────────────────────────
	# Identifiers and hyperparameters shared by the stages below.
	# Checkpoint name used for both the tokenizer and the classification model.
	MODEL_NAME = "microsoft/deberta-v3-base"
	# Repository id handed to TrainingArguments(hub_model_id=...) and printed in the final Hub URL.
	HUB_MODEL_ID = "rajvivan/deberta-v3-sentiment-analysis"
	MAX_LENGTH = 128 # Shorter for CPU efficiency
	NUM_EPOCHS = 3
	LEARNING_RATE = 2e-5
	TRAIN_BATCH_SIZE = 8 # Small for CPU RAM
	EVAL_BATCH_SIZE = 16
	GRADIENT_ACCUM = 4 # Effective batch = 32
	WARMUP_STEPS = 300
	WEIGHT_DECAY = 0.01
	SEED = 42

	# `device` is only used for the startup banner in the visible script;
	# `use_fp16` is forwarded to TrainingArguments(fp16=...), so mixed precision
	# is requested only when CUDA is available.
	device = 'cuda' if torch.cuda.is_available() else 'cpu'
	use_fp16 = torch.cuda.is_available()

	print(f"🚀 Starting Sentiment Analysis Training")
	print(f" Model: {MODEL_NAME}")
	print(f" Hub ID: {HUB_MODEL_ID}")
	print(f" Device: {device}")
	print(f" Effective batch size: {TRAIN_BATCH_SIZE * GRADIENT_ACCUM}")

	# Wall-clock reference taken before any data loading; the later
	# `train_time` is measured from here, so it includes dataset download,
	# tokenization and model loading as well as the training loop itself.
	start_time = time.time()

	# ── 1. Load and Prepare Datasets ──────────────────────────────────
	print("\n📦 Loading datasets...")

	# Both datasets are fetched with their default configuration; the splits
	# used below are 'train'/'validation' (SST-2) and 'train'/'test' (tweets).
	sst2 = load_dataset("stanfordnlp/sst2")
	tweets = load_dataset("mteb/tweet_sentiment_extraction")

	# Use a plain "|" separator: "\|" is not a valid Python escape sequence,
	# so it triggers an invalid-escape warning on modern interpreters and
	# prints a stray backslash in the log line.
	print(f" SST-2 train: {len(sst2['train'])} | val: {len(sst2['validation'])}")
	print(f" Tweet train: {len(tweets['train'])} | test: {len(tweets['test'])}")

	# ── 2. Tokenizer ──────────────────────────────────────────────────
	print("\n🔤 Loading tokenizer...")
	# Tokenizer matching the checkpoint that is fine-tuned later.
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	def preprocess_sst2(examples):
	    """Tokenize the ``sentence`` column of a batch of SST-2 examples.

	    Sequences longer than ``MAX_LENGTH`` tokens are truncated. No padding
	    is applied here, so the returned encodings have variable length.

	    Args:
	        examples: Batch mapping that provides a ``"sentence"`` entry.

	    Returns:
	        Whatever the module-level ``tokenizer`` returns for the batch.
	    """
	    sentences = examples["sentence"]
	    return tokenizer(
	        sentences,
	        padding=False,
	        truncation=True,
	        max_length=MAX_LENGTH,
	    )

	def preprocess_tweet_text(text):
	    """Twitter-specific preprocessing (TimeLM paper).

	    Splits ``text`` on single spaces and normalizes each token:
	    a token starting with ``@`` that is longer than one character becomes
	    ``@user``; otherwise a token starting with ``http`` becomes ``http``;
	    every other token is kept unchanged. Because the split is on single
	    spaces only, runs of spaces and other whitespace are preserved.

	    Args:
	        text: Raw tweet text; falsy values (``None``, ``""``) are accepted.

	    Returns:
	        The normalized text, or ``""`` when ``text`` is falsy.
	    """
	    if not text:
	        return ""

	    normalized = []
	    for token in text.split(" "):
	        if token.startswith("@") and len(token) > 1:
	            # User mention: anonymize the handle.
	            normalized.append("@user")
	        elif token.startswith("http"):
	            # Link: collapse the whole URL to a placeholder.
	            normalized.append("http")
	        else:
	            normalized.append(token)
	    return " ".join(normalized)

	def preprocess_tweets(examples):
	    """Normalize and tokenize the ``text`` column of a batch of tweets.

	    Each text is first passed through ``preprocess_tweet_text`` and the
	    cleaned batch is then tokenized with truncation to ``MAX_LENGTH``
	    tokens and no padding.

	    Args:
	        examples: Batch mapping that provides a ``"text"`` entry.

	    Returns:
	        Whatever the module-level ``tokenizer`` returns for the batch.
	    """
	    cleaned = list(map(preprocess_tweet_text, examples["text"]))
	    return tokenizer(
	        cleaned,
	        padding=False,
	        truncation=True,
	        max_length=MAX_LENGTH,
	    )

	# Tokenize SST-2
	print(" Tokenizing SST-2...")
	# Batched tokenization of the loaded SST-2 object using 2 worker processes.
	sst2_tok = sst2.map(preprocess_sst2, batched=True, num_proc=2)
	# Drop the raw columns so only the label and tokenizer outputs remain.
	sst2_tok = sst2_tok.remove_columns(["idx", "sentence"])

	# Prepare tweets: binary (remove neutral, remap labels)
	print(" Preparing tweet binary data...")
	# Rows with label 1 (treated as the neutral class here) are discarded from
	# both splits; the remaining label ids are collapsed to 0/1 afterwards.
	tweets_train_bin = tweets["train"].filter(lambda x: x["label"] != 1)
	tweets_test_bin = tweets["test"].filter(lambda x: x["label"] != 1)

	def remap_labels(example):
	    """Collapse a tweet label to a binary id, in place.

	    A label of ``2`` becomes ``1``; any other label becomes ``0``.

	    Args:
	        example: Mutable mapping with a ``"label"`` entry; it is modified
	            in place.

	    Returns:
	        The same ``example`` object, with its ``"label"`` rewritten.
	    """
	    is_positive = example["label"] == 2
	    example["label"] = int(is_positive)
	    return example

	# Collapse the remaining tweet labels to binary ids (2 -> 1, everything else -> 0).
	tweets_train_bin = tweets_train_bin.map(remap_labels)
	tweets_test_bin = tweets_test_bin.map(remap_labels)

	# Tokenize each split exactly once, then drop the raw columns so only the
	# label and tokenizer outputs remain. (The test split used to be tokenized
	# twice, with the first result discarded, which wasted a full
	# multi-process map pass.)
	tweets_train_tok = tweets_train_bin.map(preprocess_tweets, batched=True, num_proc=2)
	tweets_train_tok = tweets_train_tok.remove_columns(["id", "text", "label_text"])
	tweets_test_tok = tweets_test_bin.map(preprocess_tweets, batched=True, num_proc=2)
	tweets_test_tok = tweets_test_tok.remove_columns(["id", "text", "label_text"])

	# Cast label types to match SST-2 ClassLabel
	# NOTE(review): ClassLabel, Features and Value are imported here but are not
	# referenced anywhere in the visible part of the script.
	from datasets import ClassLabel, Features, Value
	# Reuse the schema of the tokenized SST-2 train split as the cast target so
	# both sources share one feature definition before they are concatenated.
	target_features = sst2_tok["train"].features
	tweets_train_tok = tweets_train_tok.cast(target_features)
	tweets_test_tok = tweets_test_tok.cast(target_features)

	# Combine for training
	# Only the train splits are merged; SST-2 validation and the tweet test
	# split stay separate and are used for evaluation later.
	combined_train = concatenate_datasets([sst2_tok["train"], tweets_train_tok])
	print(f" Combined train: {len(combined_train)} samples")
	print(f" SST-2 val: {len(sst2_tok['validation'])} samples")
	print(f" Tweet test: {len(tweets_test_tok)} samples")

	# ── 3. Model ──────────────────────────────────────────────────────
	print("\n🧠 Loading model...")
	# Binary label mapping passed to from_pretrained: id 0 = NEGATIVE, id 1 = POSITIVE.
	id2label = {0: "NEGATIVE", 1: "POSITIVE"}
	label2id = {"NEGATIVE": 0, "POSITIVE": 1}

	# Load the checkpoint with a two-class sequence-classification setup.
	model = AutoModelForSequenceClassification.from_pretrained(
	MODEL_NAME, num_labels=2, id2label=id2label, label2id=label2id,
	)
	# `total_params` comes from the model's own helper; `trainable` sums the
	# element counts of the parameters that have requires_grad set.
	total_params = model.num_parameters()
	trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
	print(f" Total params: {total_params:,}")
	print(f" Trainable: {trainable:,}")

	# ── 4. Metrics ────────────────────────────────────────────────────
	# Metric objects are loaded once at module level and reused by
	# compute_metrics on every evaluation pass.
	accuracy_metric = evaluate.load("accuracy")
	f1_metric = evaluate.load("f1")
	precision_metric = evaluate.load("precision")
	recall_metric = evaluate.load("recall")

	def compute_metrics(eval_pred):
	    """Compute accuracy and weighted F1/precision/recall for one evaluation pass.

	    Args:
	        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be
	            either raw logits with a trailing class axis or already-reduced
	            1-D class ids (which is what the Trainer delivers when a
	            ``preprocess_logits_for_metrics`` hook performs the argmax).

	    Returns:
	        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
	        ``"recall"``; the last three use ``average="weighted"``.
	    """
	    predictions, labels = eval_pred
	    predictions = np.asarray(predictions)
	    # Reduce only when a class axis is present. Applying argmax to a 1-D
	    # array of class ids would collapse the whole evaluation set to a single
	    # scalar index and make every metric meaningless (or raise).
	    if predictions.ndim > 1:
	        predictions = predictions.argmax(axis=-1)

	    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
	    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
	    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
	    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
	    return {
	        "accuracy": accuracy["accuracy"],
	        "f1": f1["f1"],
	        "precision": precision["precision"],
	        "recall": recall["recall"],
	    }

	def preprocess_logits_for_metrics(logits, labels):
	    """Reduce model logits to predicted class ids.

	    Args:
	        logits: A tensor of scores, or a tuple whose first element is that
	            tensor (only the first element is used).
	        labels: Unused; present to match the hook signature.

	    Returns:
	        The argmax over the last dimension of the score tensor, i.e. class
	        ids rather than logits.
	    """
	    scores = logits[0] if isinstance(logits, tuple) else logits
	    return scores.argmax(dim=-1)

	# ── 5. Training ───────────────────────────────────────────────────
	print("\n⚙️ Configuring training...")
	# Pads each batch at collation time; the tokenization functions above
	# deliberately run with padding=False.
	data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

	training_args = TrainingArguments(
	# Output location and Hub target.
	output_dir="./deberta-sentiment-output",
	hub_model_id=HUB_MODEL_ID,
	push_to_hub=True,
	# Optimization schedule; effective batch = per-device batch * accumulation steps.
	num_train_epochs=NUM_EPOCHS,
	per_device_train_batch_size=TRAIN_BATCH_SIZE,
	per_device_eval_batch_size=EVAL_BATCH_SIZE,
	gradient_accumulation_steps=GRADIENT_ACCUM,
	learning_rate=LEARNING_RATE,
	weight_decay=WEIGHT_DECAY,
	warmup_steps=WARMUP_STEPS,
	lr_scheduler_type="linear",
	max_grad_norm=1.0,
	# Evaluation and checkpointing both happen once per epoch, which is what
	# load_best_model_at_end needs in order to pick a checkpoint by accuracy.
	eval_strategy="epoch",
	save_strategy="epoch",
	logging_strategy="steps",
	logging_steps=100,
	logging_first_step=True,
	disable_tqdm=True,
	load_best_model_at_end=True,
	metric_for_best_model="accuracy",
	greater_is_better=True,
	# Mixed precision: bf16 is always off; fp16 follows CUDA availability.
	bf16=False,
	fp16=use_fp16,
	seed=SEED,
	report_to="none",
	save_total_limit=2,
	dataloader_num_workers=2,
	gradient_checkpointing=True, # Save memory
	)

	# Validation during training uses SST-2 only; the tweet test split is
	# evaluated separately after training.
	trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=combined_train,
	eval_dataset=sst2_tok["validation"],
	processing_class=tokenizer,
	data_collator=data_collator,
	compute_metrics=compute_metrics,
	# NOTE(review): this hook hands class ids (not logits) to compute_metrics.
	preprocess_logits_for_metrics=preprocess_logits_for_metrics,
	# Early stopping with a patience of 2 evaluations (see the Transformers docs for exact semantics).
	callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
	)

	# ── 6. Train ──────────────────────────────────────────────────────
	print("\n🏋️ Starting training...")
	# Approximate optimizer steps per epoch: sample count floor-divided by the
	# effective batch size (per-device batch * accumulation steps).
	print(f" Steps per epoch: ~{len(combined_train) // (TRAIN_BATCH_SIZE * GRADIENT_ACCUM)}")
	train_result = trainer.train()
	# Elapsed time since `start_time`, which was taken before dataset loading,
	# so this figure covers data preparation and model loading too.
	train_time = time.time() - start_time

	# ── 7. Evaluate ───────────────────────────────────────────────────
	# (display label, result key) pairs, printed in this order for each report.
	_metric_rows = (
	    ("Accuracy", "eval_accuracy"),
	    ("F1", "eval_f1"),
	    ("Precision", "eval_precision"),
	    ("Recall", "eval_recall"),
	)

	# In-domain check: SST-2 validation split.
	print("\n📊 Evaluating on SST-2 validation...")
	sst2_results = trainer.evaluate(sst2_tok["validation"])
	for _label, _key in _metric_rows:
	    print(f" {_label}: {sst2_results[_key]:.4f}")

	# Social-media check: binary tweet test split.
	print("\n📊 Evaluating on Tweet Sentiment test set...")
	tweet_results = trainer.evaluate(tweets_test_tok)
	for _label, _key in _metric_rows:
	    print(f" {_label}: {tweet_results[_key]:.4f}")

	# ── 8. Save results ──────────────────────────────────────────────
	# Make sure the output directory exists before the report is written.
	os.makedirs("./deberta-sentiment-output", exist_ok=True)


	def _without_eval_prefix(metrics):
	    """Return a copy of ``metrics`` with "eval_" removed from every key."""
	    return {name.replace("eval_", ""): value for name, value in metrics.items()}


	# Hyperparameters used for this run.
	_training_config = {
	    "learning_rate": LEARNING_RATE,
	    "effective_batch_size": TRAIN_BATCH_SIZE * GRADIENT_ACCUM,
	    "per_device_batch_size": TRAIN_BATCH_SIZE,
	    "gradient_accumulation_steps": GRADIENT_ACCUM,
	    "epochs": NUM_EPOCHS,
	    "warmup_steps": WARMUP_STEPS,
	    "weight_decay": WEIGHT_DECAY,
	    "max_length": MAX_LENGTH,
	    "seed": SEED,
	    "gradient_checkpointing": True,
	}

	# Sample counts of every split that took part in training or evaluation.
	_dataset_sizes = {
	    "sst2_train": len(sst2_tok["train"]),
	    "tweet_train_binary": len(tweets_train_tok),
	    "combined_train": len(combined_train),
	    "sst2_val": len(sst2_tok["validation"]),
	    "tweet_test_binary": len(tweets_test_tok),
	}

	# Key order is kept as-is so the written JSON layout does not change.
	results = {
	    "model": MODEL_NAME,
	    "hub_model_id": HUB_MODEL_ID,
	    "total_parameters": total_params,
	    "training_config": _training_config,
	    "datasets": _dataset_sizes,
	    "sst2_eval_results": _without_eval_prefix(sst2_results),
	    "tweet_eval_results": _without_eval_prefix(tweet_results),
	    "training_loss": train_result.training_loss,
	    "training_time_seconds": train_time,
	}

	with open("./deberta-sentiment-output/results.json", "w") as f:
	    json.dump(results, f, indent=2)
	print(f"\n💾 Results saved")

	# ── 9. Push to Hub ────────────────────────────────────────────────
	print("\n🚀 Pushing model to Hub...")
	# Final upload through the Trainer, with an explicit commit message and tags.
	# The destination repository is the hub_model_id configured in TrainingArguments.
	trainer.push_to_hub(
	commit_message="DeBERTa-v3-base sentiment analysis - SST-2 + Tweet binary",
	tags=["sentiment-analysis", "deberta-v3", "social-media", "text-classification"],
	)
	print(f"✅ Model published: https://huggingface.co/{HUB_MODEL_ID}")

	# ── Summary ───────────────────────────────────────────────────────
	# Console recap of the figures that were also written to results.json.
	print("\n" + "="*60)
	print("TRAINING COMPLETE - FINAL RESULTS")
	print("="*60)
	# train_time is in seconds (measured from start_time); shown here in minutes.
	print(f"Training time: {train_time/60:.1f} minutes")
	print(f"Training loss: {train_result.training_loss:.4f}")
	print(f"SST-2 Accuracy: {sst2_results['eval_accuracy']:.4f}")
	print(f"SST-2 F1: {sst2_results['eval_f1']:.4f}")
	print(f"Tweet Accuracy: {tweet_results['eval_accuracy']:.4f}")
	print(f"Tweet F1: {tweet_results['eval_f1']:.4f}")
	print("="*60)