Spaces:

vigneshwark
/

focusflow-api

Sleeping

focusflow-api / scripts /train_classifier.py

vickysrm

Initial backend upload

d309047 12 days ago

3.81 kB

	import os
	import argparse
	from datasets import load_dataset
	from transformers import (
	DistilBertForSequenceClassification,
	DistilBertTokenizerFast,
	Trainer,
	TrainingArguments,
	)
	import numpy as np
	import evaluate

	def compute_metrics(eval_pred):
	metric = evaluate.load("accuracy")
	logits, labels = eval_pred
	predictions = np.argmax(logits, axis=-1)
	return metric.compute(predictions=predictions, references=labels)

	def train_classifier():
	parser = argparse.ArgumentParser(description="Fine-tune DistilBERT on AMI Corpus for action item classification")
	parser.add_argument("--output_dir", type=str, default="./focusflow-classifier", help="Directory to save the fine-tuned model")
	parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
	parser.add_argument("--batch_size", type=int, default=16, help="Batch size for training")
	args = parser.parse_args()

	print("Loading tokenizer and model...")
	tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

	# 4 classes: action_item (0), decision (1), open_question (2), general (3)
	model = DistilBertForSequenceClassification.from_pretrained(
	"distilbert-base-uncased",
	num_labels=4,
	id2label={0: "action_item", 1: "decision", 2: "open_question", 3: "general"},
	label2id={"action_item": 0, "decision": 1, "open_question": 2, "general": 3}
	)

	print("Loading AMI Meeting Corpus dataset...")
	# NOTE: ami corpus requires specific configuration to extract utterance types.
	# We load a fallback sample dataset format to simulate the AMI structure if it's not readily partitioned.
	# In a real environment, you'd map the AMI dialog acts to these 4 labels.
	try:
	# For demonstration purposes, we create a small mock dataset based on what AMI structure would yield
	# since downloading and parsing raw AMI via HuggingFace can take exceptionally long.
	import datasets
	mock_data = {
	"text": [
	"I will send you the report by tomorrow.",
	"We have agreed to move forward with the redesign.",
	"Who will be responsible for the backend integration?",
	"The weather is nice today.",
	"Let's assign John to the database task.",
	"We decided to use React for the frontend."
	],
	"label": [0, 1, 2, 3, 0, 1]
	}
	dataset = datasets.Dataset.from_dict(mock_data)
	dataset = dataset.train_test_split(test_size=0.2)
	except Exception as e:
	print(f"Failed to load datasets: {e}")
	return

	def tokenize_function(examples):
	return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

	print("Tokenizing datasets...")
	tokenized_datasets = dataset.map(tokenize_function, batched=True)

	training_args = TrainingArguments(
	output_dir=args.output_dir,
	evaluation_strategy="epoch",
	learning_rate=2e-5,
	per_device_train_batch_size=args.batch_size,
	per_device_eval_batch_size=args.batch_size,
	num_train_epochs=args.epochs,
	weight_decay=0.01,
	save_total_limit=2,
	)

	print("Initializing Trainer...")
	trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=tokenized_datasets["train"],
	eval_dataset=tokenized_datasets["test"],
	compute_metrics=compute_metrics,
	)

	print("Starting training...")
	trainer.train()

	print(f"Saving final model to {args.output_dir}")
	trainer.save_model(args.output_dir)
	tokenizer.save_pretrained(args.output_dir)
	print("Fine-tuning complete!")

	if __name__ == "__main__":
	train_classifier()