"""
Sheikh-2.5-Coder Training Script
================================

This script handles the training pipeline for the Sheikh-2.5-Coder model.
"""
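
# Example invocation (illustrative; the script filename is an assumption,
# the defaults below match the argparse setup in main()):
#   python train.py --config training_config.yaml --output-dir ./sheikh-2.5-coder-output
#   python train.py --resume-from-checkpoint ./sheikh-2.5-coder-output/checkpoint-1000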


import os
import argparse
from typing import Optional

from transformers import (
    AutoTokenizer,
    Trainer,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset, Dataset
import yaml

from model import SheikhModel, SheikhConfig, setup_training_args


def load_config(config_path: str) -> dict:
    """Load training configuration from YAML file."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)
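
# For reference, the config layout this script expects, inferred from the
# keys read further below; the values shown are illustrative assumptions,
# not the project's actual defaults:
#
#   model:
#     hidden_size: 2048
#     num_attention_heads: 16
#     num_key_value_heads: 2
#     num_hidden_layers: 36
#     intermediate_size: 11008
#     context_length: 32768
#   training:
#     learning_rate: 2.0e-5
#   data: {}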


def prepare_training_data(data_config: dict) -> Dataset:
    """Prepare training dataset."""
    print("Loading training data...")

    # Placeholder: a single hand-written example so the pipeline runs end
    # to end; data_config is not consulted yet. Replace with real tokenized
    # data, e.g. along the lines of the sketch below.
    train_dataset = Dataset.from_dict({
        'input_ids': [[1, 2, 3, 4, 5]],
        'attention_mask': [[1, 1, 1, 1, 1]],
        'labels': [[2, 3, 4, 5, 6]]
    })

    return train_dataset
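

# A minimal sketch of what a real loader might look like, producing the
# same three columns as the placeholder above. The "dataset_name", "split",
# and "max_length" config keys and the "text" column are assumptions, not
# defined anywhere in this repo, and the caller must supply the tokenizer.
def prepare_training_data_from_hub(data_config: dict, tokenizer) -> Dataset:
    """Sketch: load a Hugging Face dataset and tokenize it for causal LM training."""
    raw = load_dataset(data_config["dataset_name"], split=data_config.get("split", "train"))

    def tokenize(batch):
        enc = tokenizer(
            batch["text"],
            truncation=True,
            max_length=data_config.get("max_length", 2048),
        )
        # For causal LM training the labels are the input ids themselves;
        # the model shifts them internally when computing the loss.
        enc["labels"] = [ids.copy() for ids in enc["input_ids"]]
        return enc

    return raw.map(tokenize, batched=True, remove_columns=raw.column_names)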


def setup_model_and_tokenizer(config: dict) -> tuple:
    """Set up the model and tokenizer."""
    print("Initializing model and tokenizer...")

    # Reuse the Qwen2.5 tokenizer rather than training one from scratch.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")

    # Qwen tokenizers ship without a pad token; reuse EOS so padded batches
    # are well defined (this adds no new tokens to the vocabulary).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model_config = SheikhConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=config['model']['hidden_size'],
        num_attention_heads=config['model']['num_attention_heads'],
        num_key_value_heads=config['model']['num_key_value_heads'],
        num_hidden_layers=config['model']['num_hidden_layers'],
        intermediate_size=config['model']['intermediate_size'],
        max_position_embeddings=config['model']['context_length'],
    )

    model = SheikhModel(model_config)

    # len(tokenizer) counts added special tokens that tokenizer.vocab_size
    # omits; resize the embedding matrix to cover them.
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer


def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    config: dict,
    output_dir: str,
    resume_from_checkpoint: Optional[str] = None,
):
    """Train the Sheikh-2.5-Coder model."""
    training_config = config['training']
    args = setup_training_args(
        output_dir=output_dir,
        learning_rate=training_config['learning_rate'],
    )

    # DataCollatorForSeq2Seq pads labels with -100 (ignored by the loss)
    # alongside the inputs, so it also works for pre-labeled causal LM data.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("Starting training...")
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Training completed! Model saved to {output_dir}")


def main():
    parser = argparse.ArgumentParser(description="Train Sheikh-2.5-Coder model")
    parser.add_argument(
        "--config",
        type=str,
        default="training_config.yaml",
        help="Path to training configuration file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./sheikh-2.5-coder-output",
        help="Directory to save the trained model",
    )
    parser.add_argument(
        "--resume-from-checkpoint",
        type=str,
        default=None,
        help="Path to checkpoint to resume from",
    )

    args = parser.parse_args()

    config = load_config(args.config)

    model, tokenizer = setup_model_and_tokenizer(config)

    # The placeholder loader currently serves both splits; point the eval
    # set at held-out data once real data preparation is wired up.
    train_dataset = prepare_training_data(config['data'])
    eval_dataset = prepare_training_data(config['data'])

    os.makedirs(args.output_dir, exist_ok=True)

    train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        config=config,
        output_dir=args.output_dir,
        resume_from_checkpoint=args.resume_from_checkpoint,
    )


if __name__ == "__main__":
    main()