seed_everything: 123
trainer:
  accelerator: gpu
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      stage: 2
      remote_device: null
      offload_optimizer: false
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    temperature: 1.0
    do_sample: true
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
    checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    weight_decay: 0.001
    warmup_steps: 20
    policy_weight: 1.0
    init_lr: 1.0e-06
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas:
      - 0.9
      - 0.999
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  scheduler: null
ckpt_path: null
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    batch_size: 1
    max_seq_length: 120
    num_samples: 2048
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1