| |
| |
|
|
| |
# Model architecture hyperparameters.
model:
  name: 'Sheikh-2.5-Coder'
  # Written with a decimal point AND a signed exponent so that YAML 1.1
  # loaders (e.g. PyYAML) resolve it as a float; the bare form `3.09e9`
  # is loaded as the string "3.09e9" by those parsers.
  num_parameters: 3.09e+9
  context_length: 32768
  vocab_size: 50257
  hidden_size: 3072
  num_attention_heads: 16
  # Fewer key/value heads than attention heads (2 vs 16); presumably
  # grouped-query attention -- confirm against the model implementation.
  num_key_value_heads: 2
  num_hidden_layers: 36
  intermediate_size: 8192
  activation: 'swiglu'
  # `1e-6` (no decimal point) is also read as a string by YAML 1.1 loaders;
  # `1.0e-6` is a float under both YAML 1.1 and 1.2.
  layer_norm_epsilon: 1.0e-6
  # Same value as context_length above.
  max_position_embeddings: 32768
|
|
| |
# Training data mixture.
data:
  # Signed exponent so YAML 1.1 loaders (e.g. PyYAML) parse this as a float;
  # the bare form `5.5e12` is loaded as the string "5.5e12" by those parsers.
  total_tokens: 5.5e+12
  # Per-source share of the mixture; the percentages below sum to 100.
  sources:
    - name: 'the-stack-v2'
      description: 'Diverse programming language dataset'
      percentage: 40
    - name: 'github-code'
      description: 'High-quality GitHub repositories'
      percentage: 25
    - name: 'synthetic-code-data'
      description: 'AI-generated code examples'
      percentage: 20
    - name: 'natural-language'
      description: 'Code documentation and comments'
      percentage: 15
|
|
| |
# Training-run hyperparameters.
training:
  # Optimizer settings.
  learning_rate: 1.0e-4
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.95
  eps: 1.0e-8

  # Schedule, batching and gradient clipping.
  warmup_steps: 2000
  max_steps: 100000
  train_batch_size: 64
  gradient_accumulation_steps: 4
  max_grad_norm: 1.0

  # Numeric precision. fp16 and bf16 are mutually exclusive mixed-precision
  # modes, and the original file enabled both. bf16 is kept because the
  # hardware section of this file lists A100 GPUs; fp16 is turned off.
  # NOTE(review): if fp16 was the intended mode, flip the two values instead.
  fp16: false
  bf16: true
  tf32: true

  # Dropout rates.
  dropout: 0.1
  attention_dropout: 0.1

  # Evaluation, checkpoint and logging cadence, in steps.
  eval_steps: 1000
  save_steps: 2000
  logging_steps: 100
|
|
| |
# Instruction-tuning stage settings.
instruction_tuning:
  enabled: true
  # Datasets used for this stage.
  data_sources:
    - 'code-instruct'
    - 'multi-turn-conversations'
    - 'programming-help'
  # Stage-specific learning rate and batch size.
  learning_rate: 5.0e-6
  train_batch_size: 16
  max_sequence_length: 32768
|
|
| |
# Memory / throughput switches.
efficiency:
  flash_attention: true
  gradient_checkpointing: true
  deepspeed: false
  fsdp: false
  # NOTE(review): use_cache and gradient_checkpointing are both enabled.
  # Some training stacks (e.g. Hugging Face Transformers) disable the KV
  # cache while checkpointing is active -- confirm which setting wins here.
  use_cache: true
  # Rotary position embedding scaling.
  rope_scaling:
    type: 'linear'
    factor: 8.0
|
|
| |
# Hardware the run is described for.
hardware:
  gpus: 8
  gpu_type: 'A100'
  gpu_memory: '80GB'
  host_memory: '1TB'
  network: 'infiniband'
|
|
| |
# Checkpoint retention and best-model selection.
checkpointing:
  save_total_limit: 3
  load_best_model_at_end: true
  # Best model is chosen on "loss"; lower is better (greater_is_better: false).
  metric_for_best_model: 'loss'
  greater_is_better: false
|
|
| |
# Distributed process-group settings.
distributed:
  world_size: 8
  # NOTE(review): rank 0 and a localhost master look like single-node
  # defaults; presumably the launcher overrides them per process -- confirm.
  rank: 0
  master_addr: 'localhost'
  master_port: 12355
|
|
| |
# Experiment-tracking backends; each has its own `enabled` switch.
logging:
  wandb:
    enabled: true
    project: 'sheikh-2.5-coder'
  tensorboard:
    enabled: true
    log_dir: './logs'
  mlflow:
    enabled: false
|
|
| |
# Benchmark evaluations, each with its own step interval and batch size.
evaluation:
  benchmarks:
    - name: 'HumanEval'
      evaluation_steps: 1000
      batch_size: 10
    - name: 'MBPP'
      evaluation_steps: 1000
      batch_size: 10
    - name: 'MultiPL-E'
      evaluation_steps: 2000
      batch_size: 5