# EVA / EVA_145M / training_config.yaml
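# Stage-1 pretraining configuration for the "tiny" EVA variant: a 16-layer,
# hidden-size-448 mixture-of-experts transformer (8 experts, top-2 routing)
# trained with a mixed masked/generative (GLM) objective under a 6.0e+18
# FLOP budget on lineage-prefixed sequence data (Greengenes taxonomy).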
data_config:
cache_size: 10000
dataset: train_30M_sqrt_s42
enable_reverse_augmentation: true
force_rebuild_index: false
glm_probability: 0.333
lineage_file: /rna-multiverse/data/training_data/lineage_greengenes.tsv
max_samples: null
max_seq_length: 8192
mode: mixed
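  # In mixed mode, each example presumably switches between the generative
  # (GLM) objective, with probability glm_probability above, and masked-LM
  # otherwise; span_config below governs how corruption spans are drawn
  # (coverage_probs selects among the max_coverage_ratios tiers, and
  # distribution_probs among the span_distributions pairs).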
span_config:
allow_overlap: false
coverage_probs:
- 0.28
- 0.3
- 0.28
- 0.14
distribution_probs:
- 0.3
- 0.5
- 0.2
max_coverage_ratios:
- 0.15
- 0.25
- 0.5
- 0.8
max_num_spans: 10
span_distributions:
- - 10
- 5
- - 20
- 10
- - 50
- 20
train_file: /rna-multiverse/data/cluster/sampling/training/train_30M_sqrt_s42_50only.fa
use_chunked: true
use_direction_tokens: true
use_lineage_prefix: true
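# 4-way data parallelism with experts sharded 4 ways; given moe_world_size: 4
# in model_config, the 8 experts are presumably split 2 per rank.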
distributed_config:
backend: nccl
data_parallel_size: 4
expert_parallel_size: 4
weight_parallel_size: 1
logging_config:
enable_wandb: true
log_dir: /rna-multiverse/results/experiments/scaling_tiny_6e18_v22_mixed_glm_stage1_20251124/logs
wandb_project: rna-lineage-stage1
wandb_run_name: scaling_tiny_6e18_v22_mixed_glm_stage1
memory_config:
cleanup_frequency: 100
enable_monitoring: true
gc_frequency: 50
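# Active architecture (matches the "tiny" preset in model_sizes below):
# 7 attention heads of dimension 64 (448 / 7), MegaBlocks MoE feed-forward
# layers, and a 114-token vocabulary, presumably nucleotides plus lineage
# and direction special tokens.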
model_config:
attention_dropout: 0.0
dropout_ramp_steps: 0
dropout_schedule: linear
dropout_warmup_steps: 0
expert_capacity_factor: 1.5
gradient_clip_norm: 0.0
hidden_dropout: 0.0
hidden_size: 448
initializer_range: 0.02
intermediate_size: 1344
label_smoothing: 0.0
max_position_embeddings: 8192
moe_implementation: megablocks
moe_world_size: 4
num_attention_heads: 7
num_experts: 8
num_experts_per_tok: 2
num_hidden_layers: 16
num_key_value_heads: 7
resid_dropout: 0.0
  rms_norm_eps: 1.0e-6
router_aux_loss_coef: 0.01
use_cache: true
vocab_size: 114
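# Preset architecture variants for scaling sweeps; this run appears to use
# the tiny preset, whose values are copied into model_config above.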
model_sizes:
large:
hidden_size: 1280
intermediate_size: 3840
num_attention_heads: 16
num_hidden_layers: 24
num_key_value_heads: 16
large_minus:
hidden_size: 1024
intermediate_size: 3072
num_attention_heads: 16
num_hidden_layers: 22
num_key_value_heads: 16
medium:
hidden_size: 768
intermediate_size: 2048
num_attention_heads: 12
num_hidden_layers: 24
num_key_value_heads: 12
medium_minus:
hidden_size: 672
intermediate_size: 2016
num_attention_heads: 12
num_hidden_layers: 20
num_key_value_heads: 12
small:
hidden_size: 512
intermediate_size: 1536
num_attention_heads: 8
num_hidden_layers: 17
num_key_value_heads: 8
tiny:
hidden_size: 448
intermediate_size: 1344
num_attention_heads: 7
num_hidden_layers: 16
num_key_value_heads: 7
ultra_tiny:
hidden_size: 256
intermediate_size: 768
num_attention_heads: 8
num_hidden_layers: 6
num_key_value_heads: 8
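# Optimizer and schedule. With per_device_train_batch_size: 16,
# gradient_accumulation_steps: 4, and (per distributed_config) 4 data-parallel
# ranks, the effective global batch is 16 * 4 * 4 = 256 sequences of up to
# 8192 tokens each. target_flops, rather than max_epochs, presumably
# determines the total step count.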
training_config:
adam_beta1: 0.9
adam_beta2: 0.95
  adam_epsilon: 1.0e-8
bf16: true
dataloader_drop_last: true
dataloader_num_workers: 8
dataloader_pin_memory: true
fp16: false
gradient_accumulation_steps: 4
gradient_checkpointing: false
learning_rate: 0.0001
logging_steps: 30
max_epochs: 1
max_wall_time_hours: 150
min_lr_ratio: 0.1
num_checkpoints: 64
output_dir: /rna-multiverse/results/experiments/scaling_tiny_6e18_v22_mixed_glm_stage1_20251124
per_device_train_batch_size: 16
run_name: scaling_tiny_6e18_v22_mixed_glm_stage1
seed: 42
target_flops: 6.0e+18
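  # Both warmup_ratio and warmup_steps are set; which one the trainer honors
  # (or whether one caps the other) depends on the training loop.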
warmup_ratio: 0.1
warmup_steps: 3000
weight_decay: 5.0e-06
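# Learning-rate/warmup presets; training_config above sets its own
# learning_rate and warmup_ratio, so these appear to be selectable profiles
# rather than values applied to this run.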
training_strategies:
fast:
    learning_rate: 2.0e-3
warmup_ratio: 0.05
fine:
    learning_rate: 1.0e-4
warmup_ratio: 0.15
stable:
    learning_rate: 5.0e-4
warmup_ratio: 0.1