# Training run configuration.
# Tables group settings for the model, job output, training loop, optimizer,
# LR schedule, checkpointing, profiling, metrics, and parallelism/precision.

[model]
# NOTE(review): both values look like Hugging Face Hub repo ids, and they point
# at different repos -- confirm the tokenizer matches the model's vocabulary.
config = "fla-hub/hamilton-350M-15B"
tokenizer_path = "mistralai/Mistral-7B-v0.1"

[job]
dump_folder = "exp"
print_args = true

[training]
# NOTE(review): batch_size is presumably per device rather than global -- confirm.
batch_size = 2
seq_len = 2048
context_len = 2048
gradient_accumulation_steps = 1
steps = 20480
max_norm = 1.0
skip_nan_inf = true
data_parallel_replicate_degree = 1
# NOTE(review): -1 presumably means "derive from the available devices" -- confirm
# against the trainer's argument handling.
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
compile = false
dataset = "SlimPajama-627B"
dataset_name = "default"
num_workers = 32
pin_memory = false
persistent_workers = false
prefetch_factor = 2
seed = 42
varlen = false

[optimizer]
name = "AdamW"
# NOTE(review): far below PyTorch's AdamW default of 1e-8; presumably
# intentional -- confirm.
eps = 1e-15
lr = 3e-4

[lr_scheduler]
# 1024 of the 20480 total steps (5%).
warmup_steps = 1024
decay_type = "cosine"
# NOTE(review): 0.1 is larger than the peak lr (3e-4), so this is presumably a
# fraction of the peak lr rather than an absolute value -- confirm.
lr_min = 0.1

[checkpoint]
enable_checkpoint = true
folder = "checkpoint"
interval_type = "steps"
# 20480 total steps / 2048 = 10 intervals over the run.
interval = 2048
model_weights_only = false
export_dtype = "float32"
async_mode = "disabled"

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 256

[metrics]
# NOTE(review): presumably measured in training steps -- confirm.
log_freq = 32
enable_wandb = true

[experimental]
context_parallel_degree = 1
pipeline_parallel_degree = 1

[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false

[activation_checkpoint]
mode = "none"