data_config:
  cache_size: 10000
  dataset: train_30M_sqrt_s42
  enable_reverse_augmentation: true
  force_rebuild_index: false
  glm_probability: 0.333
  lineage_file: /rna-multiverse/data/training_data/lineage_greengenes.tsv
  max_samples: null
  max_seq_length: 8192
  mode: mixed
  span_config:
    allow_overlap: false
    coverage_probs:
    - 0.28
    - 0.3
    - 0.28
    - 0.14
    distribution_probs:
    - 0.3
    - 0.5
    - 0.2
    max_coverage_ratios:
    - 0.15
    - 0.25
    - 0.5
    - 0.8
    max_num_spans: 10
    span_distributions:
    - - 10
      - 5
    - - 20
      - 10
    - - 50
      - 20
  train_file: /rna-multiverse/data/cluster/sampling/training/train_30M_sqrt_s42_50only.fa
  use_chunked: true
  use_direction_tokens: true
  use_lineage_prefix: true
distributed_config:
  backend: nccl
  data_parallel_size: 4
  expert_parallel_size: 4
  weight_parallel_size: 1
logging_config:
  enable_wandb: true
  log_dir: /rna-multiverse/results/experiments/scaling_tiny_6e18_v22_mixed_glm_stage1_20251124/logs
  wandb_project: rna-lineage-stage1
  wandb_run_name: scaling_tiny_6e18_v22_mixed_glm_stage1
memory_config:
  cleanup_frequency: 100
  enable_monitoring: true
  gc_frequency: 50
model_config:
  attention_dropout: 0.0
  dropout_ramp_steps: 0
  dropout_schedule: linear
  dropout_warmup_steps: 0
  expert_capacity_factor: 1.5
  gradient_clip_norm: 0.0
  hidden_dropout: 0.0
  hidden_size: 448
  initializer_range: 0.02
  intermediate_size: 1344
  label_smoothing: 0.0
  max_position_embeddings: 8192
  moe_implementation: megablocks
  moe_world_size: 4
  num_attention_heads: 7
  num_experts: 8
  num_experts_per_tok: 2
  num_hidden_layers: 16
  num_key_value_heads: 7
  resid_dropout: 0.0
  rms_norm_eps: 1e-6
  router_aux_loss_coef: 0.01
  use_cache: true
  vocab_size: 114
model_sizes:
  large:
    hidden_size: 1280
    intermediate_size: 3840
    num_attention_heads: 16
    num_hidden_layers: 24
    num_key_value_heads: 16
  large_minus:
    hidden_size: 1024
    intermediate_size: 3072
    num_attention_heads: 16
    num_hidden_layers: 22
    num_key_value_heads: 16
  medium:
    hidden_size: 768
    intermediate_size: 2048
    num_attention_heads: 12
    num_hidden_layers: 24
    num_key_value_heads: 12
  medium_minus:
    hidden_size: 672
    intermediate_size: 2016
    num_attention_heads: 12
    num_hidden_layers: 20
    num_key_value_heads: 12
  small:
    hidden_size: 512
    intermediate_size: 1536
    num_attention_heads: 8
    num_hidden_layers: 17
    num_key_value_heads: 8
  tiny:
    hidden_size: 448
    intermediate_size: 1344
    num_attention_heads: 7
    num_hidden_layers: 16
    num_key_value_heads: 7
  ultra_tiny:
    hidden_size: 256
    intermediate_size: 768
    num_attention_heads: 8
    num_hidden_layers: 6
    num_key_value_heads: 8
training_config:
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1e-8
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 8
  dataloader_pin_memory: true
  fp16: false
  gradient_accumulation_steps: 4
  gradient_checkpointing: false
  learning_rate: 0.0001
  logging_steps: 30
  max_epochs: 1
  max_wall_time_hours: 150
  min_lr_ratio: 0.1
  num_checkpoints: 64
  output_dir: /rna-multiverse/results/experiments/scaling_tiny_6e18_v22_mixed_glm_stage1_20251124
  per_device_train_batch_size: 16
  run_name: scaling_tiny_6e18_v22_mixed_glm_stage1
  seed: 42
  target_flops: 6.0e+18
  warmup_ratio: 0.1
  warmup_steps: 3000
  weight_decay: 5.0e-06
training_strategies:
  fast:
    learning_rate: 2e-3
    warmup_ratio: 0.05
  fine:
    learning_rate: 1e-4
    warmup_ratio: 0.15
  stable:
    learning_rate: 5e-4
    warmup_ratio: 0.1
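# ---------------------------------------------------------------------------
# Usage sketch (added as comments so this file stays valid YAML; not part of
# the recorded run config). The active model_config above mirrors the `tiny`
# preset in model_sizes: hidden_size 448 (7 heads x head_dim 64), 16 layers,
# intermediate_size 1344. Assuming the file is saved as config.yaml and
# PyYAML is installed, a minimal load-and-consistency-check would be:
#
#     import yaml
#
#     with open("config.yaml") as f:
#         cfg = yaml.safe_load(f)
#
#     # Verify the active model dims match the named preset.
#     tiny = cfg["model_sizes"]["tiny"]
#     for key, value in tiny.items():
#         assert cfg["model_config"][key] == value, f"mismatch on {key}"
# ---------------------------------------------------------------------------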