| |
import argparse
import json
import logging

import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
|
|
| |
# Configure the root logger at import time so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
class AcoliTrainer:
    """Fine-tune a sequence-classification model on a JSONL dataset.

    The constructor loads the tokenizer and the classification model for
    ``model_name``. ``train`` reads a JSONL file, tokenizes it, holds out a
    fraction for evaluation, runs ``transformers.Trainer`` and saves the
    model and tokenizer to disk.
    """

    def __init__(self, model_name="xlm-roberta-base", num_labels=3):
        """Load the tokenizer and model.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both
                the tokenizer and the model.
            num_labels: Passed to the model's ``from_pretrained`` as
                ``num_labels``.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    @staticmethod
    def _read_jsonl(jsonl_path):
        """Return ``(texts, labels)`` read from a JSONL file.

        Each non-blank line must be a JSON object with ``text`` and
        ``label`` keys. Blank or whitespace-only lines are skipped.
        """
        texts = []
        labels = []
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line (typically a trailing newline at the end of
                # the file) is not a JSON document; json.loads would raise.
                if not line.strip():
                    continue
                record = json.loads(line)
                texts.append(record['text'])
                labels.append(record['label'])
        return texts, labels

    def load_data(self, jsonl_path):
        """Load a JSONL file into a ``Dataset`` with ``text``/``label`` columns.

        Args:
            jsonl_path: Path to a UTF-8 file with one JSON object per line.

        Returns:
            A ``datasets.Dataset`` built from the collected texts and labels.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks the ``text`` or ``label`` key.
        """
        texts, labels = self._read_jsonl(jsonl_path)
        return Dataset.from_dict({
            'text': texts,
            'label': labels,
        })

    def preprocess_function(self, examples):
        """Tokenize the ``text`` column of a batch of examples.

        Sequences are truncated to 512 tokens but deliberately NOT padded
        here: ``train`` uses ``DataCollatorWithPadding``, which pads each
        training batch to its own longest sequence. Padding at map time
        would instead pad every example to the longest sequence of its
        whole map chunk, wasting memory and compute.
        """
        return self.tokenizer(
            examples['text'],
            truncation=True,
            max_length=512,
        )

    def train(self, train_path, output_dir="./acoli-model", test_size=0.2,
              seed=None):
        """Fine-tune the model and save it together with the tokenizer.

        Args:
            train_path: Path to the JSONL training file.
            output_dir: Directory for checkpoints and the final model.
            test_size: Fraction (or count) of examples held out for
                evaluation; forwarded to ``Dataset.train_test_split``.
            seed: Optional seed for the train/eval split. ``None`` keeps
                the library's default (unseeded) behaviour.

        Returns:
            The ``transformers.Trainer`` instance after training.

        Raises:
            ValueError: If the training file contains no examples.
        """
        logger.info("Loading training data...")
        dataset = self.load_data(train_path)
        if len(dataset) == 0:
            # Fail early with a clear message instead of an obscure error
            # from the split or the training loop.
            raise ValueError(f"No training examples found in {train_path!r}")
        tokenized_dataset = dataset.map(self.preprocess_function, batched=True)

        # Hold out part of the data so each epoch can be evaluated.
        splits = tokenized_dataset.train_test_split(
            test_size=test_size,
            seed=seed,
        )
        train_dataset = splits['train']
        eval_dataset = splits['test']

        # NOTE(review): newer transformers releases rename
        # ``evaluation_strategy`` to ``eval_strategy`` (and Trainer's
        # ``tokenizer`` to ``processing_class``) - confirm against the
        # pinned transformers version before upgrading.
        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,
        )

        # Pads every batch to the longest sequence in that batch.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )

        logger.info("Starting training...")
        trainer.train()

        logger.info("Saving model to %s", output_dir)
        trainer.save_model(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        return trainer
|
|
def main(argv=None):
    """Train a model from the command line.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    The training file path and output directory may be supplied on the
    command line; when omitted they fall back to the values that were
    previously hard-coded in this script.
    """
    parser = argparse.ArgumentParser(
        description="Fine-tune a sequence classifier on a JSONL file.",
    )
    parser.add_argument(
        "train_path",
        nargs="?",
        default="path/to/your/data.jsonl",
        help="JSONL file with one {'text': ..., 'label': ...} object per line",
    )
    parser.add_argument(
        "--output-dir",
        default="./acoli-model",
        help="directory where checkpoints and the final model are written",
    )
    args = parser.parse_args(argv)

    acoli_trainer = AcoliTrainer()
    acoli_trainer.train(args.train_path, output_dir=args.output_dir)

    print("Training completed successfully!")


if __name__ == "__main__":
    main()