Spaces:
Sleeping
Sleeping
File size: 3,813 Bytes
import os
import argparse
from datasets import load_dataset
from transformers import (
DistilBertForSequenceClassification,
DistilBertTokenizerFast,
Trainer,
TrainingArguments,
)
import numpy as np
import evaluate
def compute_metrics(eval_pred):
    """Compute accuracy over a batch of evaluation predictions.

    Args:
        eval_pred: An ``(logits, labels)`` pair as supplied by the
            Hugging Face ``Trainer`` during evaluation.

    Returns:
        dict: ``{"accuracy": float}`` — fraction of correct predictions.
    """
    logits, labels = eval_pred
    # The predicted class is the argmax over the last (logits) axis.
    predictions = np.argmax(logits, axis=-1)
    # Compute accuracy directly with numpy instead of calling
    # evaluate.load("accuracy") on every invocation — the original
    # re-loaded (and potentially re-downloaded) the metric script at
    # each evaluation pass. Result shape/keys are unchanged.
    accuracy = float((predictions == np.asarray(labels)).mean())
    return {"accuracy": accuracy}
def train_classifier():
    """Fine-tune DistilBERT as a 4-way meeting-utterance classifier.

    Parses CLI arguments (``--output_dir``, ``--epochs``, ``--batch_size``),
    builds a DistilBERT sequence-classification head with 4 labels
    (action_item / decision / open_question / general), trains on a small
    mock dataset standing in for the AMI Meeting Corpus, and saves the
    fine-tuned model and tokenizer to ``--output_dir``.
    """
    parser = argparse.ArgumentParser(description="Fine-tune DistilBERT on AMI Corpus for action item classification")
    parser.add_argument("--output_dir", type=str, default="./focusflow-classifier", help="Directory to save the fine-tuned model")
    parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size for training")
    args = parser.parse_args()

    print("Loading tokenizer and model...")
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    # 4 classes: action_item (0), decision (1), open_question (2), general (3)
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=4,
        id2label={0: "action_item", 1: "decision", 2: "open_question", 3: "general"},
        label2id={"action_item": 0, "decision": 1, "open_question": 2, "general": 3},
    )

    print("Loading AMI Meeting Corpus dataset...")
    # NOTE: ami corpus requires specific configuration to extract utterance types.
    # We load a fallback sample dataset format to simulate the AMI structure if it's not readily partitioned.
    # In a real environment, you'd map the AMI dialog acts to these 4 labels.
    try:
        # For demonstration purposes, we create a small mock dataset based on what AMI structure would yield
        # since downloading and parsing raw AMI via HuggingFace can take exceptionally long.
        import datasets

        mock_data = {
            "text": [
                "I will send you the report by tomorrow.",
                "We have agreed to move forward with the redesign.",
                "Who will be responsible for the backend integration?",
                "The weather is nice today.",
                "Let's assign John to the database task.",
                "We decided to use React for the frontend.",
            ],
            # Labels follow the id2label mapping declared on the model above.
            "label": [0, 1, 2, 3, 0, 1],
        }
        dataset = datasets.Dataset.from_dict(mock_data)
        # 80/20 split; with 6 examples this yields a 4/2 train/test split.
        dataset = dataset.train_test_split(test_size=0.2)
    except Exception as e:
        # Best-effort: report the failure and bail out rather than crash.
        print(f"Failed to load datasets: {e}")
        return

    def tokenize_function(examples):
        # Fixed-length padding keeps batches rectangular; 128 tokens is
        # ample for single meeting utterances.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    print("Tokenizing datasets...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        save_total_limit=2,  # keep only the two most recent checkpoints
    )

    print("Initializing Trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving final model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    # Save the tokenizer alongside the model so the output_dir is
    # loadable with from_pretrained() as a self-contained artifact.
    tokenizer.save_pretrained(args.output_dir)
    print("Fine-tuning complete!")
# Script entry point: run training only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    train_classifier()